#!/usr/bin/env python

# CYK Parsing algorithm that keeps track of all parse trees
# Note: The grammar must be in Chomsky Normal Form.
# Author: Robert Keller
# Disclaimer: This is my first Python program of substance, so go easy.
# Execute as 
#
#     python cyk.py

# An entry [i][j] in the parsing matrix is a set of pairs, where
# the first element is a non-terminal and the second element
# is a parse tree for for the portion of the input, call it x
#     x[i] x[i+1] . . . x[j]
# If the set is empty, there is no parse tree for that input segment.

# Component selectors for tuples (low-life data abstractions)
# Tuples are used to represent:
#    (a) productions (LHS, RHS1) or (LHS, RHS1, RHS2)
#    (b) individual non-terminal entries in the parse matrix
#    (c) parse trees
# (b) consists of a non-terminal and a parse tree (c).
# (The non-terminal is always the root of the tree, so is redundant.  
# However, it reflects that we first implemented
# the algorithm with the non-terminal, then added the tree.)

# tuple selectors for productions
LHS  = 0        # left-hand side of a production
RHS1 = 1        # first right-hand side symbol in a production
RHS2 = 2        # second right-hand side symbol in a production (if any)
LEN1 = 2        # length of a production with 1-symbol RHS
LEN2 = 3        # length of a production with 2-symbol RHS

# tuple selectors for parse array entries
SYM = 0         # symbol part of a parse array entry
TREE = 1        # tree part of a parse array entry

# tuple selectors for parse tree entries
ROOT = 0        # root of a tree
LEFT = 1        # left sub-tree of a tree
RIGHT = 2       # right sub-tree of a tree


def parse(input, productions):

    # Parse input according to productions

    print "\nParsing input: ", input

    n = len(input)

    # create n x n array of empty sets

    a = []
    for i in range(0, n):
        row = []   
        for j in range(0, n):
            row.append([])
        a.append(row)

    # initialize array main diagonal using productions with RHS length 1

    for i in range(0, n):
        for prod in productions:
            if len(prod) == LEN1 and prod[RHS1] == input[i]:
                a[i][i] = [(prod[LHS], (prod[LHS], input[i]))]

    # compute matrix entries for diagonals above the main diagonal

    for diagonal in range(1, n):
        col = diagonal
        row = 0
        while col < n:
            i = row
            j = row + 1
            while i < col:

                # compute a[row][col] by combining appropriate
                # earlier computed entries

                a[row][col].extend(cross(a[row][i], a[j][col], productions))
                i = i + 1
                j = j + 1
            col = col+1
            row = row+1

    # show results

    cornerEntry = a[0][n-1]
    numTrees = len(cornerEntry)

    if numTrees == 1:
        print "There is one parse tree:"
        showTrees(cornerEntry)
    elif numTrees > 0:
        print "There are", numTrees, "parse trees:"
        showTrees(cornerEntry)
    else:
        print "Input string is not generated the grammar."


def lhs(x, y, productions):

    # Return the list of left-hand sides for 
    # productions with right-hand side x y.

    result = []
    for prod in productions:
        # see if RHS of production matches x and y non-terminals

        if len(prod) == LEN2 and x[SYM] == prod[RHS1] and y[SYM] == prod[RHS2]:
            result.append((prod[LHS], (prod[LHS], x[TREE], y[TREE])))

    return result


def cross(S, T, productions):

    # Return the list of left-hand sides for 
    # productions with right-hand side non-terminals in 
    # lists S and T respectively

    result = []
    for x in S:
        for y in T:
            result.extend(lhs(x, y, productions))

    return result


def showTrees(entry):

    # Show parse trees for a given matrix entry.

    for i in range(0, len(entry)):
        print "--------------------"
        print "Tree ", i, ":"
        showTree(entry[i][1], "")


def showTree(tree, indentation):

    # Show a single parse tree.

    if type(tree) is tuple:
        # print non-leaf
        print indentation, tree[ROOT]
        for i in range(1, len(tree)):
            showTree(tree[i], indentation+"  ")
    else:
        # print leaf
        print indentation, tree


def showArray(a):

    # Show the parse array (used in debugging)

    print "--------------------"
    n = len(a)
    for i in range(0, n):
        for j in range(0, n):
            print a[i][j],
        print "\n"


def example1():

    # A balanced parentheses language:

    productions = [('S', 'L', 'R'), 
                   ('S', 'L', 'T'), 
                   ('T', 'S', 'R'),
                   ('S', 'S', 'S'),
                   ('L', '('),
                   ('R', ')')
                  ]

    good = ['(', '(', ')', '(', '(', ')', ')', ')']

    parse(good, productions)

    bad = ['(', '(', ')', '(', '(', '(', ')', ')']

    parse(bad, productions)


def example2():

    # A language from Kozen, exercise 92a.

    productions = [('S', 'S', 'T'), 
                   ('T', 'B', 'S'),
                   ('S', 'S', 'S'),
                   ('S', 'a'),
                   ('B', '+')
                  ]

    input1 = ['a', '+', 'a', '+', 'a', '+', 'a']

    parse(input1, productions)

    input2 = ['a', '+', 'a', '+', 'a', '+', 'a', '+', 'a']

    parse(input2, productions)
    

def main():

    # The main program

    example1()
    example2()


# Run the main program

main()