// file: kwic.java
// author: Robert Keller
// purpose: Port kwic index application from rex to Java.
import java.io.*;
/**
* Class kwic defines three primary methods:
*
* - kwic creates a kwic index from a list of titles, in the form of an
* OpenList.
*
* - format formats the result of kwic into a single String.
*
* - main is an application that produces a kwic index using command-line
* arguments with the titles coming from the standard input.
*
*/
class kwic
{
    /**
     * A Character version of a blank space, used as the padding element
     * when a title part is shorter than its column.
     */
    static private final Character blank = Character.valueOf(' ');

    /**
     * The list of noise words if no filename is specified on the command line.
     */
    static public OpenList noiseDefault = OpenList.nil;

    /**
     * The number of spaces to be used for the left part of the title if not
     * specified on the command line.
     */
    static public int leftDefault = 35;

    /**
     * The number of spaces to be used for the right part of the title if not
     * specified on the command line.
     */
    static public int rightDefault = 35;

    /**
     * kwic creates a kwic index from a list of titles, in the form of an OpenList.
     * Excluded are instances of titles that focus on a word in the list of noise
     * words. The resulting entries are sorted with a TitleComparator (defined
     * outside this file).
     * @param Noise OpenList of noise words
     * @param Titles OpenList of titles, each title being a single String.
     * @return an OpenList kwic index. Each element of the index consists of
     * a list of words before the keyword, a list of words including and following
     * the keyword, and a reference number indicating the position of the title in
     * the original list, starting with 0.
     */
    // rex: kwic(Noise, Titles) =
    // rex: sort(compare, splitTitles(Noise, number(parse(Titles))));
    static OpenList kwic(OpenList Noise, OpenList Titles)
    {
        OpenList numbered = number(parse(Titles));
        return splitTitles(Noise, numbered).sort(new TitleComparator());
    }

    /**
     * number numbers the elements of any open list.
     * @param L an OpenList of elements to be numbered.
     * @return an OpenList consisting of the elements of L paired with successive
     * numbers starting at 0. The pairings are themselves OpenLists, with the
     * original element as the first and the number as the second. The numbers
     * are in class Integer.
     */
    // rex: number(List) = number(List, 0);
    static OpenList number(OpenList L)
    {
        return number(L, 0);
    }

    /**
     * auxiliary function for number of 1 argument.
     * @param L an OpenList of elements to be numbered
     * @param N the number starting from which the elements of L are numbered
     * @return an OpenList of [element, Integer] pairs, numbered N, N+1, ...
     */
    // rex: number([], N) => [];
    // rex: number([A | L], N) => [[A, N] | number(L, N+1)];
    static OpenList number(OpenList L, int N)
    {
        if( L.isEmpty() )
        {
            return OpenList.nil;
        }
        // Integer.valueOf replaces the deprecated Integer(int) constructor.
        OpenList pair = OpenList.list(L.first(), Integer.valueOf(N));
        return OpenList.cons(pair, number(L.rest(), N+1));
    }

    /**
     * parse parses the titles into OpenLists of words by mapping a Parse
     * object (defined outside this file) over the list.
     * A word is defined to be consecutive non-blanks between one or more blanks.
     * @param strings an OpenList of titles
     * @return an OpenList of OpenLists of words of the titles.
     */
    // rex: parse(Strings) = map((title) => parse1(explode(title)), Strings);
    static OpenList parse(OpenList strings)
    {
        return strings.map(new Parse());
    }

    /**
     * splitTitles produces, from a list of numbered titles, a list of triples:
     * words to the left of the keyword, words from the keyword on, and the
     * reference number. Triples where the keyword is a noise word are not
     * included.
     *
     * @param Noise the OpenList of noise words
     * @param Titles the OpenList of pairs consisting of title and a reference num
     * @return a list of triples representing the split titles.
     */
    // rex: splitTitles(Noise, []) => [];
    // rex: splitTitles(Noise, [[Title, Reference] | More]) =>
    // rex: split(Noise, [], Title, Reference, splitTitles(Noise, More));
    static OpenList splitTitles(OpenList Noise, OpenList Titles)
    {
        if( Titles.isEmpty() )
        {
            return OpenList.nil;
        }
        OpenList pair = (OpenList)Titles.first();
        OpenList title = (OpenList)pair.first();
        Object reference = pair.second();
        // The splits of the remaining titles become the tail onto which the
        // splits of this title are consed.
        OpenList remaining = splitTitles(Noise, Titles.rest());
        return split(Noise, OpenList.nil, title, reference, remaining);
    }

    /**
     * split is an auxiliary method used in splitting titles. It returns a list of
     * split titles.
     * @param Noise the list of noise words
     * @param Previous words (in reverse) before the keyword currently considered
     * @param Words the words consisting of the keyword and the words following it
     * @param Reference the reference number of this title
     * @param Tail a list of further split titles returned by a recursive call
     * of splitTitles
     * @return a list of split titles beginning with the particular split
     * determined by Previous and Words
     */
    // rex: split(Noise, Previous, [], Reference, Tail) => Tail;
    // rex:
    // rex: split(Noise, Previous, [Word | FollowingWords], Ref, Tail)
    // rex: => member(Word, Noise) ?
    // rex: split(Noise, [Word | Previous], FollowingWords, Ref, Tail);
    // rex:
    // rex: split(Noise, Previous, [Word | FollowingWords], Ref, Tail) =>
    // rex:
    // rex: [ [reverse(Previous), [Word | FollowingWords], Ref]
    // rex: | split(Noise, [Word | Previous], FollowingWords, Ref, Tail)];
    static OpenList split(OpenList Noise,
                          OpenList Previous,
                          OpenList Words,
                          Object Reference,
                          OpenList Tail)
    {
        if( Words.isEmpty() )
        {
            return Tail;
        }
        Object Word = Words.first();
        OpenList Rest = Words.rest();
        // Splits for every later keyword position in this title; the current
        // word moves onto the (reversed) list of preceding words.
        OpenList continuation = split(Noise,
                                      OpenList.cons(Word, Previous),
                                      Rest,
                                      Reference,
                                      Tail);
        if( Noise.member(Word) )
        {
            // A noise word never becomes a keyword: contribute no entry for it.
            return continuation;
        }
        OpenList entry = OpenList.list(Previous.reverse(),
                                       OpenList.cons(Word, Rest),
                                       Reference);
        return OpenList.cons(entry, continuation);
    }

    /**
     * format produces a single string from a list of triples. The strings are
     * laid out according to the number of spaces specified in the arguments.
     * Each triple contributes one line terminated by a newline.
     * @param Left the number of spaces to be used for the left part of the title
     * @param Right the number of spaces to be used for the right part of the title
     * @param Triples an OpenList of triples containing the titles. Each triple
     * consists of a left part, a right part, and a reference number.
     * @return a single String containing the formatted kwic index
     */
    // rex: format(Left, Right, Triples) =
    // rex: lconcat(map((Triple) => format1(Left, Right, Triple), Triples));
    static String format(int Left, int Right, OpenList Triples)
    {
        StringBuffer buffer = new StringBuffer();
        for( OpenList cursor = Triples; cursor.nonEmpty(); cursor = cursor.rest() )
        {
            format1(Left, Right, (OpenList)cursor.first(), buffer);
        }
        return buffer.toString();
    }

    /**
     * format1 is an auxiliary method for format that formats a single title.
     * Each call of format1 uses the same StringBuffer for deposit of results.
     * The line appended is: the left part padded to Left characters, a blank,
     * the right part padded to Right characters, ": ", the reference number,
     * and a newline.
     * @param Left the number of spaces for the left part of the title
     * @param Right the number of spaces for the right part of the title
     * @param Triple an OpenList of three items: the left part of the title, the
     * right part of the title, and the reference number
     * @param buffer the StringBuffer into which the various pieces of a title
     * line are placed
     */
    // rex: format1(Left, Right, [Before, After, Reference]) =>
    // rex: concat(reverse(padString(Left, reverse(Before.lconcat(" ")))), " ",
    // rex: padString(Right, After.lconcat(" ")), ": ",
    // rex: make_string(Reference), "\n");
    static void format1(int Left, int Right, OpenList Triple, StringBuffer buffer)
    {
        OpenList Before = (OpenList)Triple.first();
        OpenList After = (OpenList)Triple.second();
        Integer Reference = (Integer)Triple.third();

        OpenList leftChars = OpenList.explode(Before.lconcat(" "));
        OpenList rightChars = OpenList.explode(After.lconcat(" "));

        buffer.append(padBefore(Left, leftChars).implode());
        buffer.append(" ");
        buffer.append(padAfter(Right, rightChars).implode());
        buffer.append(": ");
        buffer.append(Reference);
        buffer.append("\n");
    }

    /**
     * padAfter(N, chars) truncates or pads with blanks, as necessary, a list of
     * characters, so that the result is exactly N characters. Spaces are
     * added to the back of the list. If the list has more than N characters,
     * then the trailing characters are truncated.
     * @param N the number of spaces to be occupied by the final result; a value
     * of zero or less yields the empty list
     * @param chars the OpenList of Character's to be truncated or padded.
     * @return an OpenList of exactly N Character's (empty if N is not positive)
     */
    // rex: padChars(0, Chars) => [];
    // rex: padChars(N, []) => [' ' | padChars(N-1, [])];
    // rex: padChars(N, [Char | Chars]) => [Char | padChars(N-1, Chars)];
    static OpenList padAfter(int N, OpenList chars)
    {
        // "<= 0" rather than "== 0": a negative width would otherwise never
        // reach the base case and recurse until the stack overflows.
        if( N <= 0 )
        {
            return OpenList.nil;
        }
        if( chars.isEmpty() )
        {
            return OpenList.cons(blank, padAfter(N-1, OpenList.nil));
        }
        return OpenList.cons(chars.first(), padAfter(N-1, chars.rest()));
    }

    /**
     * padBefore(N, chars) truncates or pads with blanks, as necessary, a list of
     * characters, so that the result is exactly N characters. Spaces are
     * added to the front of the list. If the list has more than N characters,
     * then the leading characters are truncated, keeping the last N.
     * This is done by reversing, applying padAfter, and reversing again.
     * @param N the number of spaces to be occupied by the final result
     * @param chars the OpenList of Character's to be truncated or padded.
     * @return an OpenList of exactly N Character's (empty if N is not positive)
     */
    static OpenList padBefore(int N, OpenList chars)
    {
        return padAfter(N, chars.reverse()).reverse();
    }

    /**
     * getInteger tries to parse an integer value from a String. If the parse
     * is successful, the value is returned as an int. If not, an error message
     * is printed and the program is terminated.
     * @param s the String to be converted to an int
     * @return the int parsed from the String argument
     */
    public static int getInteger(String s)
    {
        try
        {
            return Integer.parseInt(s);
        }
        catch( NumberFormatException e )
        {
            errorExit("Invalid integer specified: " + s);
        }
        // Not reached in practice: errorExit terminates the program. The
        // return is only here to satisfy the compiler.
        return 0;
    }

    /**
     * main is the main program. It reads up to three parameters from the
     * command-line:
     *
     * - the name of a file of noise words, one word per line
     *
     * - the number of spaces to be used for the left part of the title
     *
     * - the number of spaces to be used for the right part of the title
     *
     * and reads the titles from the standard input, one per line.
     * It outputs a kwic index of the titles according to the specified layout.
     * If there are only two command-line parameters, the right number of spaces
     * is given a default value.
     * If there is only one, the left number of spaces is also given a default
     * value.
     * If there are no command-line parameters, then the noise words are given a
     * default value.
     * More than three parameters, a non-numeric or negative width, or an
     * unreadable noise file cause an error message and termination.
     */
    public static void main(String arg[])
    {
        OpenList noise = noiseDefault; // parameter values initialized to defaults
        int left = leftDefault;        // These will be over-ridden if specified
        int right = rightDefault;      // on the command line
        OpenList input = OpenList.nil; // The input as a list of lines.

        if( arg.length > 3 )
        {
            errorExit("At most three command-line parameters are allowed.");
        }

        // Arguments are validated from last to first (right, left, noise
        // file), so the first error reported is for the right-most bad one.
        if( arg.length >= 3 )
        {
            right = getInteger(arg[2]);
            if( right < 0 )
            {
                errorExit("Numeric arguments cannot be negative: " + right);
            }
        }

        if( arg.length >= 2 )
        {
            left = getInteger(arg[1]);
            if( left < 0 )
            {
                errorExit("Numeric arguments cannot be negative: " + left);
            }
        }

        if( arg.length >= 1 )
        {
            String noisefilename = arg[0];
            try
            {
                // NOTE(review): this reader is never closed. Whether
                // OpenList.readLines consumes it eagerly is not visible in
                // this file, so closing it here is left for confirmation.
                noise = OpenList.readLines(
                    new BufferedReader(
                        new InputStreamReader(
                            new FileInputStream(noisefilename))));
            }
            catch( FileNotFoundException e )
            {
                errorExit("No such file for noise: " + noisefilename + ".");
            }
            catch( IOException e )
            {
                errorExit("IOException reading noise file: " + e + ".");
            }
        }

        try
        {
            input = OpenList.readLines(
                new BufferedReader(new InputStreamReader(System.in)));
        }
        catch( IOException e )
        {
            errorExit("IOException on input: " + e + ".");
        }

        System.out.println(format(left, right, kwic(noise, input)));
    }

    /**
     * Print error message to the standard error stream, prefixed with "*** ",
     * and exit with status 1.
     * @param message the content of the message to be printed.
     */
    static void errorExit(String message)
    {
        System.err.println("*** " + message);
        System.exit(1);
    }
}