DELTA 86 0 2104
#!/opt/local/bin/python

import re

def findFields(chunk_a,chunk_b):
    """Compare two similar chunks of HTML and return a list of fields -- ranges
    of plausible variance -- in their apparent underlying structure. Fields are
    represented as two-tuples (start,end) of character indices in chunk_a."""
    
    from difflib import SequenceMatcher
    
    class SpanState:
        """A SpanState detects and keeps track of a parser's state (either "inside"
        or "outside") with respect to some kind of character-delimited text range.
        If differences occur within a span, the entire span is marked as a field."""
        def __init__(self, start_delim, end_delim="", start_open=False):
            self.start_delim = start_delim
            # A single delimiter (e.g. '"') both opens and closes the span.
            self.end_delim = end_delim if end_delim else start_delim
            self.inside = start_open
            if start_open:
                self.difference_inside = False
                self.opened_at = 0
            # Matches a context whose tail contains an opening delimiter that
            # is never closed: skip leading non-delimiter text, skip any number
            # of fully closed spans, then capture (group 1) the delimiter of a
            # span still open at the end of the context.
            # NOTE(review): reconstructed from a corrupted line in the dump --
            # confirm against upstream history.
            self.open_regex = re.compile(
                '[^' + self.start_delim + ']*'
                + '(?:' + self.start_delim + '[^' + self.end_delim + ']*'
                + self.end_delim + '[^' + self.start_delim + ']*)*'
                + '(' + self.start_delim + ')'
                + '[^' + self.end_delim + ']*$')
        def processContext(self, context, a_start, fields=None):
            """Advance the span state across one difference-free stretch of
            chunk_a. context is chunk_a[a_start:a_end]; a_start anchors local
            offsets back to absolute indices in chunk_a. If a span closes here
            and a difference occurred inside it, append (open,close) to fields."""
            search_start = 0 # When trying to open the span, start searching here.
            if self.inside: # try to close the span
                loc = context.find(self.end_delim)
                if loc != -1:
                    self.inside = False
                    if fields is not None and self.difference_inside:
                        fields.append((self.opened_at, a_start + loc))
                    search_start = loc + 1 # opening must come _after_ this closure
            if not self.inside: # try to open/reopen the span
                match = self.open_regex.match(context[search_start:])
                if match:
                    self.inside = True
                    self.difference_inside = False
                    # First character after the opening delimiter.
                    self.opened_at = a_start + match.start(1) + search_start + 1
        def processDifference(self):
            """Record that a difference fell inside the current span (if any)."""
            self.difference_inside = True
    
    fields = []

    # difflib works on lists (not strings), so explode the chunks.
    list_a = list(chunk_a)
    list_b = list(chunk_b)
    
    # Walk through the results of the matcher, constructing initial fields.
    matcher = SequenceMatcher(None, list_a, list_b)
    quote_state = SpanState('"')          # fields are often inside "" pairs
    cdata_state = SpanState('>','<',True) # or outside any tags (plain CDATA)
    for op, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if op == "equal":
            context = chunk_a[a_start:a_end]
            quote_state.processContext(context, a_start, fields)
            cdata_state.processContext(context, a_start, fields)
        else: # difflib found some kind of difference
            # Don't bother establishing more fields where span analysis has us covered.
            if not (quote_state.inside or cdata_state.inside):
                fields.append((a_start,a_end))
            quote_state.processDifference()
            cdata_state.processDifference()

    return fields

def regexFromFields(chunk,fields):
    """Construct a pattern from chunk replacing its fields with non-greedy
    subpatterns like (.*?)."""
    
    # Accumulate escaped literal context interleaved with capture groups,
    # then join once at the end.
    pieces = []
    prev_end = 0
    for start, end in fields:
        pieces.append(re.escape(chunk[prev_end:start])) # non-field context
        pieces.append("(.*?)")
        prev_end = end
    pieces.append(re.escape(chunk[prev_end:]))
    
    return "".join(pieces)

if __name__ == '__main__':
    import sys
    
    # Four integer character offsets: start/end of chunk A and start/end of
    # chunk B within the document read from stdin.
    a_start, a_end, b_start, b_end = [int(a) for a in sys.argv[1:5]]
    infile = sys.stdin.read()
    
    # generate a regular expression for the specified input chunks
    chunk_a, chunk_b = infile[a_start:a_end], infile[b_start:b_end]
    pattern = regexFromFields(chunk_a,findFields(chunk_a,chunk_b))
    
    # match the pattern against the whole page, printing the captured field
    # values for each occurrence of the inferred structure
    regex = re.compile(pattern)
    for match in regex.finditer(infile):
        print(match.groups())  # single-arg parenthesized form: same output on Py2 and Py3
id: 1f.0.r104/4063
type: file
pred: 1f.0.r93/347
count: 8
text: 104 0 4040 4432 0b49434a5faefcb8228a8473c7b4371e
props: 86 2117 30 0 4160c74de5f4e580dc15660c798ff9fc
cpath: /project3/individual_components/sampson/scraper.py
copyroot: 0 /

PLAIN
K 10
scraper.py
V 19
file 1f.0.r104/4063
K 4
test
V 18
dir 1g.0.r96/11735
END
ENDREP
id: 17.0.r104/4393
type: dir
pred: 17.0.r96/11985
count: 10
text: 104 4302 78 78 c2b9cdc00193b84ede37a09d0f3eca90
cpath: /project3/individual_components/sampson
copyroot: 0 /

PLAIN
K 16
lamotte-mitchell
V 18
dir 16.0.r95/14825
K 7
sampson
V 18
dir 17.0.r104/4393
K 8
striplin
V 18
dir 18.0.r102/7393
END
ENDREP
id: 15.0.r104/4705
type: dir
pred: 15.0.r102/7705
count: 19
text: 104 4569 123 123 182a5724454d35404d76aae8b2a4fd45
cpath: /project3/individual_components
copyroot: 0 /

PLAIN
K 20
architecture.graffle
V 19
file 10.0.r83/28052
K 16
architecture.tex
V 19
file 13.0.r98/49032
K 24
architecture_diagram.pdf
V 19
file 1a.0.r83/28269
K 19
component_specs.tex
V 19
file 14.0.r102/7875
K 9
index.txt
V 18
file 1s.0.r103/926
K 21
individual_components
V 18
dir 15.0.r104/4705
K 16
project_plan.tex
V 19
file 1c.0.r98/48593
K 22
project_plan_table.odt
V 18
file z.p.r98/49251
K 22
project_plan_table.pdf
V 19
file 1d.0.r98/48805
K 11
svninfo.sty
V 18
file 12.0.r49/5199
K 13
work_products
V 17
dir 1m.0.r97/1979
END
ENDREP
id: y.0.r104/5419
type: dir
pred: y.0.r103/1604
count: 62
text: 104 4875 531 531 851fed9488c2c0078ebe922d3bf42266
cpath: /project3
copyroot: 0 /

PLAIN
K 8
project2
V 16
dir x.0.r41/3433
K 8
project3
V 17
dir y.0.r104/5419
END
ENDREP
id: 0.0.r104/5653
type: dir
pred: 0.0.r103/1838
count: 104
text: 104 5565 75 75 9167eeb7efc18b8b2fdc233052b8e898
cpath: /
copyroot: 0 /

1f.0.t103-1 modify true false /project3/individual_components/sampson/scraper.py


5653 5790
