#!/usr/bin/env python # CYK Parsing algorithm that keeps track of all parse trees # Note: The grammar must be in Chomsky Normal Form. # Author: Robert Keller # Disclaimer: This is my first Python program of substance, so go easy. # Execute as # # python cyk.py # An entry [i][j] in the parsing matrix is a set of pairs, where # the first element is a non-terminal and the second element # is a parse tree for for the portion of the input, call it x # x[i] x[i+1] . . . x[j] # If the set is empty, there is no parse tree for that input segment. # Component selectors for tuples (low-life data abstractions) # Tuples are used to represent: # (a) productions (LHS, RHS1) or (LHS, RHS1, RHS2) # (b) individual non-terminal entries in the parse matrix # (c) parse trees # (b) consists of a non-terminal and a parse tree (c). # (The non-terminal is always the root of the tree, so is redundant. # However, it reflects that we first implemented # the algorithm with the non-terminal, then added the tree.) # tuple selectors for productions LHS = 0 # left-hand side of a production RHS1 = 1 # first right-hand side symbol in a production RHS2 = 2 # second right-hand side symbol in a production (if any) LEN1 = 2 # length of a production with 1-symbol RHS LEN2 = 3 # length of a production with 2-symbol RHS # tuple selectors for parse array entries SYM = 0 # symbol part of a parse array entry TREE = 1 # tree part of a parse array entry # tuple selectors for parse tree entries ROOT = 0 # root of a tree LEFT = 1 # left sub-tree of a tree RIGHT = 2 # right sub-tree of a tree def parse(input, productions): # Parse input according to productions print "\nParsing input: ", input n = len(input) # create n x n array of empty sets a = [] for i in range(0, n): row = [] for j in range(0, n): row.append([]) a.append(row) # initialize array main diagonal using productions with RHS length 1 for i in range(0, n): for prod in productions: if len(prod) == LEN1 and prod[RHS1] == input[i]: a[i][i] = [(prod[LHS], (prod[LHS], input[i]))] # compute matrix entries for diagonals above the main diagonal for diagonal in range(1, n): col = diagonal row = 0 while col < n: i = row j = row + 1 while i < col: # compute a[row][col] by combining appropriate # earlier computed entries a[row][col].extend(cross(a[row][i], a[j][col], productions)) i = i + 1 j = j + 1 col = col+1 row = row+1 # show results cornerEntry = a[0][n-1] numTrees = len(cornerEntry) if numTrees == 1: print "There is one parse tree:" showTrees(cornerEntry) elif numTrees > 0: print "There are", numTrees, "parse trees:" showTrees(cornerEntry) else: print "Input string is not generated the grammar." def lhs(x, y, productions): # Return the list of left-hand sides for # productions with right-hand side x y. result = [] for prod in productions: # see if RHS of production matches x and y non-terminals if len(prod) == LEN2 and x[SYM] == prod[RHS1] and y[SYM] == prod[RHS2]: result.append((prod[LHS], (prod[LHS], x[TREE], y[TREE]))) return result def cross(S, T, productions): # Return the list of left-hand sides for # productions with right-hand side non-terminals in # lists S and T respectively result = [] for x in S: for y in T: result.extend(lhs(x, y, productions)) return result def showTrees(entry): # Show parse trees for a given matrix entry. for i in range(0, len(entry)): print "--------------------" print "Tree ", i, ":" showTree(entry[i][1], "") def showTree(tree, indentation): # Show a single parse tree. if type(tree) is tuple: # print non-leaf print indentation, tree[ROOT] for i in range(1, len(tree)): showTree(tree[i], indentation+" ") else: # print leaf print indentation, tree def showArray(a): # Show the parse array (used in debugging) print "--------------------" n = len(a) for i in range(0, n): for j in range(0, n): print a[i][j], print "\n" def example1(): # A balanced parentheses language: productions = [('S', 'L', 'R'), ('S', 'L', 'T'), ('T', 'S', 'R'), ('S', 'S', 'S'), ('L', '('), ('R', ')') ] good = ['(', '(', ')', '(', '(', ')', ')', ')'] parse(good, productions) bad = ['(', '(', ')', '(', '(', '(', ')', ')'] parse(bad, productions) def example2(): # A language from Kozen, exercise 92a. productions = [('S', 'S', 'T'), ('T', 'B', 'S'), ('S', 'S', 'S'), ('S', 'a'), ('B', '+') ] input1 = ['a', '+', 'a', '+', 'a', '+', 'a'] parse(input1, productions) input2 = ['a', '+', 'a', '+', 'a', '+', 'a', '+', 'a'] parse(input2, productions) def main(): # The main program example1() example2() # Run the main program main()