// file: a02.rex // author: Robert Keller // purpose: Keyword-in-Context (kwic) function implementation with formatter // // description: Two functions are implemented: // Function kwic takes a list of titles and a list of noise // words, returning a kwic index in the form of a list of // triples: // words before the keyword, // words after the keyword, and // reference number // Reference numbers are assigned starting with 0 for the // first title. // // Function format takes the number of spaces for the left // and right titles and a list of triples, as returned by // kwic, and returns a single string representing the titles // laid out in the specified number of spaces. // Function kwic is described above. kwic(Noise, Titles) = sort(compare, splitTitles(Noise, number(parse(Titles)))); // kwic first parses all of the titles, then attaches a reference number. // It then splits the titles into left and right lists of words and sorts // by keyword. // number numbers the elements of any list, starting with 0. The number // elements are returned as a pair [element, index] number(List) = number(List, 0); // number(L, N) numbers the elements of list L starting with N number([], N) => []; number([A | L], N) => [[A, N] | number(L, N+1)]; // parse(Strings) parses each string of a list of strings. // Each string is parsed into a list of the words. A word is defined to // be successive characters, not including blanks, but separated by at // least one blank. // Punctuation following a word without intervening space is considered to // be part of the word. parse(Strings) = map((word) => parse1(explode(word)), Strings); // parse1 parses a list of characters. It ignores any initial blanks. // Once a non-blank is found, it turns over parsing to parse3. parse1([]) => []; parse1([' ' | Chars]) => parse1(Chars); parse1([NonBlank | Chars]) => parse2([NonBlank], Chars); // parse2(Acc, Chars) starts with a non-blank character in its accumulator and // accumulates the chars of a word in reverse, up until the next blank or // to the end of the list Chars, at which time it re-creates the word and // calls parse1. parse2(Acc, []) => [reverse(implode(Acc))]; parse2(Acc, [' ' | Chars]) => [reverse(implode(Acc)) | parse1(Chars)]; parse2(Acc, [Char | Chars]) => parse2([Char | Acc], Chars); // splitTitles(Noise, NumberedTitles) splits each title (with reference) // into two, using all non-Noise words as breakpoints. // The first rule handles the case of no titles. splitTitles(Noise, []) => []; // The second rule appends the results of splitting the first title // to the result of splitting the rest of the titles. // The append is implicit in that the fifth argument to splits is the // tail of the list, eventually to be returned in the basis. // splitTitles(Noise, [[Title, Reference] | More]) => split(Noise, [], Title, Reference, splitTitles(Noise, More)); // split(Noise, [], FollowingWords, Reference, Tail) produces a list of the // split titles, followed by Tail. // The second argument is the accumulation of words before the keyword. // It is kept in reverse order for ease in adding the next word. // The fifth argument is the Tail of the ultimate list, use to avoid // an explicit append step. // The first rule handles the basis: no titles to split: split(Noise, PreviousWordsReversed, [], Reference, Tail) => Tail; // The second rule handles the case where the word in the first position is // noise. It just returns the result of splits on the rest of the words, // but the Word is still accumulated with the previous words in the title. split(Noise, PreviousWordsReversed, [Word | FollowingWords], Ref, Tail) => member(Word, Noise) ? split(Noise, [Word | PreviousWordsReversed], FollowingWords, Ref, Tail); // The third rule handles the case where the word in the first position is // not noise. It creates a list with the words in the title, followed by // the result of splitting the remainder of the title. split(Noise, PreviousWordsReversed, [Word | FollowingWords], Ref, Tail) => [ [reverse(PreviousWordsReversed), [Word | FollowingWords], Ref] | split(Noise, [Word | PreviousWordsReversed], FollowingWords, Ref, Tail)]; // format produces a single string from a list of triples: // [words before keyword, words after keyword, reference number] format(Left, Right, Triples) = lconcat(map((Triple) => format1(Left, Right, Triple), Triples)); // format1 formats one title, padding the before and after portions with the // appropriate number of blanks, if any. format1(Left, Right, [Before, After, Reference]) => concat(reverse(padString(Left, reverse(lconcat(Before, " ")))), " ", padString(Right, lconcat(After, " ")), ": ", make_string(Reference), "\n"); // Returns a string of exactly N characters, by padding String on the right. padString(N, String) = implode(padChars(N, explode(String))); // padChars(N, Chars) truncates or pads with blanks, as necessary, a list of // characters, so that the result is exactly N characters. Spaces are // added to the front of the list. If the list has more than N characters, // then the trailing characters are truncated. padChars(0, Chars) => []; padChars(N, []) => [' ' | padChars(N-1, [])]; padChars(N, [Char | Chars]) => [Char | padChars(N-1, Chars)]; // sort(compare, L) sorts a list of items using compare as the comparison // operator. sort(compare, L) = mergeall(compare, map((X) => [X], L)); // Bottom-up merge sort is used. First a list of singleton lists is made // of the items in the original list. Then mergeall is used to repeatedly // merge lists into longer lists, until there is only one list left, and // that list is returned. // mergeall merges lists in L pair-wise, resulting in a new list of // fewer but longer lists. It then calls itself. // When there is only one list of lists left, the inner list is returned. mergeall(compare, []) => []; mergeall(compare, [L]) => L; mergeall(compare, L) => mergeall(compare, merge1(compare, L)); // merge1 merges individual pairs in the list of lists, by calling the // builtin-in merge. merge1(compare, []) => []; merge1(compare, [A]) => [A]; merge1(compare, [A, B | L]) => [merge(compare, A, B) | merge1(compare, L)]; // compare compares two triples [Before, After, Reference] // to see which should come first. compare([F1, S1, N1], [F2, S2, N2]) => compareWords(S1, S2); // compareWords compares the words in two lists. // If the first list is empty then it is considered <= the second. compareWords([], _) => 1; // If the first list is non-empty, but the second is empty, then the // the first is considered > the second. compareWords(_, []) => 0; // If neither list is empty, then lower case version of the words are // compared. If the result is <, then the first list is <= the second. // If the result is ==, then the rests of the lists are compared recursively. // Otherwise, the result is >. compareWords([A | X], [B | Y]) => AL = lower(A), BL = lower(B), AL < BL ? 1 : AL == BL ? compareWords(X, Y) : 0; // lower(String) converts the elements in String to all lower-case lower(String) = implode(map(lower1, explode(String))); // lower maps lower1 across the characters in the string, then implodes // the result. lower1('A') => 'a'; lower1('B') => 'b'; lower1('C') => 'c'; lower1('D') => 'd'; lower1('E') => 'e'; lower1('F') => 'f'; lower1('G') => 'g'; lower1('H') => 'h'; lower1('I') => 'i'; lower1('J') => 'j'; lower1('K') => 'k'; lower1('L') => 'l'; lower1('M') => 'm'; lower1('N') => 'n'; lower1('O') => 'o'; lower1('P') => 'p'; lower1('Q') => 'q'; lower1('R') => 'r'; lower1('S') => 's'; lower1('T') => 't'; lower1('U') => 'u'; lower1('V') => 'v'; lower1('W') => 'w'; lower1('X') => 'x'; lower1('Y') => 'y'; lower1('Z') => 'z'; lower1(X) => X;