// file:    a02.rex
// author:  Robert Keller
// purpose: Keyword-in-Context (kwic) function implementation with formatter
//          
// description: Two functions are implemented: 
//                  Function kwic takes a list of titles and a list of noise 
//                  words, returning a kwic index in the form of a list of 
//                  triples:
//                      words before the keyword, 
//                      words after the keyword, and 
//                      reference number
//                  Reference numbers are assigned starting with 0 for the
//                  first title.
//        
//                  Function format takes the number of spaces for the left
//                  and right titles and a list of triples, as returned by 
//                  kwic, and returns a single string representing the titles
//                  laid out in the specified number of spaces.

// Function kwic is described above.

kwic(Noise, Titles) = sort(compare, splitTitles(Noise, number(parse(Titles))));

// kwic first parses all of the titles, then attaches a reference number.
// It then splits the titles into left and right lists of words and sorts
// by keyword.


// number numbers the elements of any list, starting with 0.  The number
// elements are returned as a pair [element, index]

number(List) = number(List, 0);


// number(L, N) numbers the elements of list L starting with N

number([], N) => [];

number([A | L], N) => [[A, N] | number(L, N+1)];


// parse(Strings) parses each string of a list of strings.  
// Each string is parsed into a list of the words.  A word is defined to
// be successive characters, not including blanks, but separated by at
// least one blank.
// Punctuation following a word without intervening space is considered to
// be part of the word.

parse(Strings) = map((word) => parse1(explode(word)), Strings);


// parse1 parses a list of characters. It ignores any initial blanks.  
// Once a non-blank is found, it turns over parsing to parse3.

parse1([]) => [];

parse1([' ' | Chars]) => parse1(Chars);

parse1([NonBlank | Chars]) => parse2([NonBlank], Chars);


// parse2(Acc, Chars) starts with a non-blank character in its accumulator and
// accumulates the chars of a word in reverse, up until the next blank or
// to the end of the list Chars, at which time it re-creates the word and 
// calls parse1.

parse2(Acc, []) => [reverse(implode(Acc))];

parse2(Acc, [' ' | Chars]) => [reverse(implode(Acc)) | parse1(Chars)];

parse2(Acc, [Char | Chars]) => parse2([Char | Acc], Chars);


// splitTitles(Noise, NumberedTitles) splits each title (with reference) 
// into two, using all non-Noise words as breakpoints.

// The first rule handles the case of no titles.

splitTitles(Noise, []) => [];

// The second rule appends the results of splitting the first title
// to the result of splitting the rest of the titles.  
// The append is implicit in that the fifth argument to splits is the
// tail of the list, eventually to be returned in the basis.
// 

splitTitles(Noise, [[Title, Reference] | More]) =>

    split(Noise, [], Title, Reference, splitTitles(Noise, More));


// split(Noise, [], FollowingWords, Reference, Tail) produces a list of the 
// split titles, followed by Tail.
// The second argument is the accumulation of words before the keyword.  
// It is kept in reverse order for ease in adding the next word.
// The fifth argument is the Tail of the ultimate list, use to avoid
// an explicit append step.

// The first rule handles the basis: no titles to split:

split(Noise, PreviousWordsReversed, [], Reference, Tail) => Tail;

// The second rule handles the case where the word in the first position is
// noise.  It just returns the result of splits on the rest of the words,
// but the Word is still accumulated with the previous words in the title.

split(Noise, PreviousWordsReversed, [Word | FollowingWords], Ref, Tail) => 

    member(Word, Noise) ? 

      split(Noise, [Word | PreviousWordsReversed], FollowingWords, Ref, Tail);

// The third rule handles the case where the word in the first position is
// not noise.  It creates a list with the words in the title, followed by
// the result of splitting the remainder of the title.

split(Noise, PreviousWordsReversed,  [Word | FollowingWords], Ref, Tail) => 

  [ [reverse(PreviousWordsReversed), [Word | FollowingWords], Ref] 

    | split(Noise, [Word | PreviousWordsReversed], FollowingWords, Ref, Tail)];


// format produces a single string from a list of triples: 
//     [words before keyword, words after keyword, reference number]

format(Left, Right, Triples) =
    lconcat(map((Triple) => format1(Left, Right, Triple), Triples));


// format1 formats one title, padding the before and after portions with the
// appropriate number of blanks, if any.

format1(Left, Right, [Before, After, Reference]) =>
    concat(reverse(padString(Left,  reverse(lconcat(Before, " ")))), " ",
                   padString(Right,         lconcat(After,  " ")),   ": ",
           make_string(Reference),                                   "\n");


// Returns a string of exactly N characters, by padding String on the right.

padString(N, String) = implode(padChars(N, explode(String)));


// padChars(N, Chars) truncates or pads with blanks, as necessary, a list of 
// characters, so that the result is exactly N characters.  Spaces are
// added to the front of the list.  If the list has more than N characters,
// then the trailing characters are truncated.

padChars(0, Chars) => [];

padChars(N, []) => [' ' | padChars(N-1, [])];

padChars(N, [Char | Chars]) => [Char | padChars(N-1, Chars)];


// sort(compare, L) sorts a list of items using compare as the comparison
// operator.

sort(compare, L) = mergeall(compare, map((X) => [X], L));

// Bottom-up merge sort is used.  First a list of singleton lists is made
// of the items in the original list.  Then mergeall is used to repeatedly
// merge lists into longer lists, until there is only one list left, and
// that list is returned.


// mergeall merges lists in L pair-wise, resulting in a new list of
// fewer but longer lists.  It then calls itself.
// When there is only one list of lists left, the inner list is returned.

mergeall(compare, []) => [];

mergeall(compare, [L]) => L;

mergeall(compare, L) => mergeall(compare, merge1(compare, L));


// merge1 merges individual pairs in the list of lists, by calling the
// builtin-in merge.

merge1(compare, []) => [];

merge1(compare, [A]) => [A];

merge1(compare, [A, B | L]) => [merge(compare, A, B) | merge1(compare, L)];


// compare compares two triples [Before, After, Reference] 
// to see which should come first.

compare([F1, S1, N1], [F2, S2, N2]) => compareWords(S1, S2);

// compareWords compares the words in two lists.

// If the first list is empty then it is considered <= the second.

compareWords([], _) => 1;

// If the first list is non-empty, but the second is empty, then the
// the first is considered > the second.

compareWords(_, []) => 0;

// If neither list is empty, then lower case version of the words are
// compared.  If the result is <, then the first list is <= the second.
// If the result is ==, then the rests of the lists are compared recursively.
// Otherwise, the result is >.

compareWords([A | X], [B | Y]) => 
    AL = lower(A),
    BL = lower(B),
    AL < BL ? 1 
  : AL == BL ? compareWords(X, Y)
  : 0;


// lower(String) converts the elements in String to all lower-case

lower(String) = implode(map(lower1, explode(String)));

// lower maps lower1 across the characters in the string, then implodes
// the result.

lower1('A') => 'a';
lower1('B') => 'b';
lower1('C') => 'c';
lower1('D') => 'd';
lower1('E') => 'e';
lower1('F') => 'f';
lower1('G') => 'g';
lower1('H') => 'h';
lower1('I') => 'i';
lower1('J') => 'j';
lower1('K') => 'k';
lower1('L') => 'l';
lower1('M') => 'm';
lower1('N') => 'n';
lower1('O') => 'o';
lower1('P') => 'p';
lower1('Q') => 'q';
lower1('R') => 'r';
lower1('S') => 's';
lower1('T') => 't';
lower1('U') => 'u';
lower1('V') => 'v';
lower1('W') => 'w';
lower1('X') => 'x';
lower1('Y') => 'y';
lower1('Z') => 'z';
lower1(X)   => X;