// file: kwic.java // author: Robert Keller // purpose: Port kwic index application from rex to Java. import java.io.*; /** * Class kwic defines three primary methods: * */ class kwic { /** * A Character version of a blank space */ static private Character blank = new Character(' '); /** * The list of noise words if no filename is specifed on the command line. */ static public OpenList noiseDefault = OpenList.nil; /** * The number of spaces to be used for the left part of the title if not * specified on the command line. */ static public int leftDefault = 35; /** * The number of spaces to be used for the right part of the title if not * specified on the command line. */ static public int rightDefault = 35; /** * kwic creates a kwic index from a list of titles, in the form of an OpenList. * Excluded are instances of titles that focus on a word in the list of noise * words. * @param Noise OpenList of noise words * @param Titles OpenList of titles, each title being a single String. * @return an OpenList kwic index. Each element of the index consists of * a list of words before the keyword, a list of words including and following * the keyword, and a reference number indicating the position of the title in * the original list, starting with 0. */ // rex: kwic(Noise, Titles) = // rex: sort(compare, splitTitles(Noise, number(parse(Titles)))); static OpenList kwic(OpenList Noise, OpenList Titles) { return splitTitles(Noise, number(parse(Titles))).sort(new TitleComparator()); } /** * number numbers the elements of any open list. * @param L an OpenList of elements to be numbered. * @return an OpenList of consisting of elements of L paired with successive * numbers starting at 0. The pairings are themselves OpenLists, with the * original element as the first and the number as the second. The numbers * are in class Integer. */ // rex: number(List) = number(List, 0); static OpenList number(OpenList L) { return number(L, 0); } /** * auxiliary function for number of 1 argument. * @param L an OpenList of elements to be numbered * @param N the number starting from which the elements of L are numbered */ // rex: number([], N) => []; // rex: number([A | L], N) => [[A, N] | number(L, N+1)]; static OpenList number(OpenList L, int N) { return L.isEmpty() ? OpenList.nil : OpenList.cons(OpenList.list(L.first(), new Integer(N)), number(L.rest(), N+1)); } /** * parse parses the titles into OpenLists of words. * A word is defined to be consecutive non-blanks between one or more blanks. * @param Strings an OpenList of titles * @return an OpenList of OpenLists of words of the titles. */ // rex: parse(Strings) = map((title) => parse1(explode(title)), Strings); static OpenList parse(OpenList strings) { return strings.map(new Parse()); } /** * splitTitles produces, from a list of numbered titles, a list of triples: * words to the left of the keyword, words from the keyword on, and the * reference number. Triples where the keyword is noise word are not included. * * @param Noise the OpenList of noise words * @param Titles the OpenList of pairs consisting of title and a reference num * @return a list triples representing the split titles. */ // rex: splitTitles(Noise, []) => []; // rex: splitTitles(Noise, [[Title, Reference] | More]) => // rex: split(Noise, [], Title, Reference, splitTitles(Noise, More)); static OpenList splitTitles(OpenList Noise, OpenList Titles) { if( Titles.isEmpty() ) return OpenList.nil; OpenList pair = (OpenList)Titles.first(); OpenList title = (OpenList)pair.first(); Object reference = pair.second(); return split(Noise, OpenList.nil, title, reference, splitTitles(Noise, Titles.rest())); } /** * split is an auxiliary method used in splitting titles. It returns a list of * split titles. * @param Noise the list of noise words * @param Previous words (in reverse) before the keyword currently considered * @param Words the words consisting of the keyword and the words following it * @param Reference the reference number of this title * @param Tail a list of further split titles returned by a recursive call * of splitTitles * @return a list of split titles beginning with the particular split * determined by Previous and Words */ // rex: split(Noise, Previous, [], Reference, Tail) => Tail; // rex: // rex: split(Noise, Previous, [Word | FollowingWords], Ref, Tail) // rex: => member(Word, Noise) ? // rex: split(Noise, [Word | Previous], FollowingWords, Ref, Tail); // rex: // rex: split(Noise, Previous, [Word | FollowingWords], Ref, Tail) => // rex: // rex: [ [reverse(Previous), [Word | FollowingWords], Ref] // rex: | split(Noise, [Word | Previous], FollowingWords, Ref, Tail)]; static OpenList split(OpenList Noise, OpenList Previous, OpenList Words, Object Reference, OpenList Tail) { if( Words.isEmpty() ) return Tail; Object Word = Words.first(); OpenList Rest = Words.rest(); OpenList continuation = split(Noise, OpenList.cons(Word, Previous), Rest, Reference, Tail); return Noise.member(Word) ? continuation : OpenList.cons(OpenList.list(Previous.reverse(), OpenList.cons(Word, Rest), Reference), continuation); } /** * format produces a single string from a list of triples. The strings are * laid out according to the number of spaces specified in the arguments. * @param Left the number of spaces to be used for the left part of the title * @param Right the number of spaces to be used for the right part of the title * @param Triples an OpenList of triples containing the titles. Each triple * consists of a left part, a right part, and a reference number. * @return a single String containing the formatted kwic index */ // rex: format(Left, Right, Triples) = // rex: lconcat(map((Triple) => format1(Left, Right, Triple), Triples)); static String format(int Left, int Right, OpenList Triples) { StringBuffer buffer = new StringBuffer(); for( ; Triples.nonEmpty(); Triples = Triples.rest() ) { format1(Left, Right, (OpenList)Triples.first(), buffer); } return buffer.toString(); } /** * format1 is an auxiliary method for format that formats a single title. * Each call of format1 uses the same StringBuffer for deposit of results. * @param Left the number of spaces for the left part of the title * @param Right the number of spaces for the right part of the title * @param Triple an OpenList of three items: the left part of the title, the * right part of the title, and the reference number * @param buffer the StringBuffer into which the various pieces of a title * line are placed */ // rex: format1(Left, Right, [Before, After, Reference]) => // rex: concat(reverse(padString(Left, reverse(Before.lconcat(" ")))), " ", // rex: padString(Right, After.lconcat(" ")), ": ", // rex: make_string(Reference), "\n"); static void format1(int Left, int Right, OpenList Triple, StringBuffer buffer) { OpenList Before = (OpenList)Triple.first(); OpenList After = (OpenList)Triple.second(); Integer Reference = (Integer)Triple.third(); buffer.append(padBefore(Left, OpenList.explode(Before.lconcat(" "))).implode()); buffer.append(" "); buffer.append(padAfter(Right, OpenList.explode(After.lconcat(" "))).implode()); buffer.append(": "); buffer.append(Reference); buffer.append("\n"); } /** * padAfter(N, Chars) truncates or pads with blanks, as necessary, a list of * characters, so that the result is exactly N characters. Spaces are * added to the front of the list. If the list has more than N characters, * then the trailing characters are truncated. * @param N the number of spaces to be occupied by the final result * @param Chars the OpenList of Character's to be truncated or padded. * @return an OpenList of exactly N Character's */ // rex: padChars(0, Chars) => []; // rex: padChars(N, []) => [' ' | padChars(N-1, [])]; // rex: padChars(N, [Char | Chars]) => [Char | padChars(N-1, Chars)]; static OpenList padAfter(int N, OpenList chars) { if( N == 0 ) return OpenList.nil; if( chars.isEmpty() ) return OpenList.cons(blank, padAfter(N-1, OpenList.nil)); return OpenList.cons(chars.first(), padAfter(N-1, chars.rest())); } /** * padBefore(N, Chars) truncates or pads with blanks, as necessary, a list of * characters, so that the result is exactly N characters. Spaces are * added to the back of the list. If the list has more than N characters, * then the trailing characters are truncated. * @param N the number of spaces to be occupied by the final result * @param Chars the OpenList of Character's to be truncated or padded. * @return an OpenList of exactly N Character's */ static OpenList padBefore(int N, OpenList chars) { return padAfter(N, chars.reverse()).reverse(); } /** * getInteger tries to parse an integer value from a String. If the parse * is successful, the value is returned as an int. If not, an error message * is printed and the program is terminated. * @param s the String to be converted to an int * @return the int parsed from the String argument */ public static int getInteger(String s) { try { return Integer.parseInt(s); } catch( NumberFormatException e ) { errorExit("Invalid integer specified: " + s); } return 0; } /** * main is the main program. It reads up to three parameters from the * command-line: * * and reads the titles from the standard input, one per line. * It outputs a kwic index of the titles according to the specified layout. * If there are only two command-line parameters, the right number of spaces * is given a default value. * If there is only one, the left number of spaces is also given a default * value. * If there are no command-line parameters, then the noise words are given a * default value. */ public static void main(String arg[]) { OpenList noise = noiseDefault; // parameter values initialized to defaults int left = leftDefault; // These will be over-ridden if specified int right = rightDefault; // on the command line OpenList input = OpenList.nil; // The input as a list of lines. switch( arg.length ) { default: errorExit("At most three command-line parameters are allowed."); case 3: right = getInteger(arg[2]); if( right < 0 ) { errorExit("Numeric arguments cannot be negative: " + right); }; case 2: left = getInteger(arg[1]); if( left < 0 ) { errorExit("Numeric arguments cannot be negative: " + left); }; case 1: { String noisefilename = arg[0]; try { noise = OpenList.readLines( new BufferedReader( new InputStreamReader( new FileInputStream(noisefilename)))); } catch( FileNotFoundException e ) { errorExit("No such file for noise: " + noisefilename + "."); } catch( IOException e ) { errorExit("IOException reading noise file: " + e + "."); } } case 0: // nothing to do but use default values } try { input = OpenList.readLines( new BufferedReader(new InputStreamReader(System.in))); } catch( IOException e ) { errorExit("IOException on input: " + e + "."); } System.out.println(format(left, right, kwic(noise, input))); } /** * Print error message and exit. * @param message the content of the message to be printed. */ static void errorExit(String message) { System.err.println("*** " + message); System.exit(1); } }