; A Scheme function that derives a DFA from a Regular Expression ; based on the derivatives method, originally describe in the paper ; Derivatives of Regular Expressions, Janusz A. Brzozowski, ; Journal of the ACM, Volume 11, Issue 4, 1964. ; Program author: Robert M. Keller ; Thanks to Stuart Pernsteiner for finding a bug in derivConct-helper (emp was originally lam). ; ; The syntax of regular expressions is here represented by S expressions, as follows: ; ; A single letter is represented by a symbol which is the letter itself. Below, ; the allowable alphabet is specified, although that aspect can be modified. ; ; The empty-set is represented by 'empty. ; ; The set consisting of the empty string is represented by 'lambda. ; ; The union of two (or more) sets is represented by (+ R S ...) where R and S ... are regular expressions. ; ; The concatenation of two (or more) sets is represented by (^ R S ...) where R and S ... are regular expressions. ; ; The star of a set is represented by (* R) where R is a regular expression. ; ; Examples, one per line: ; a ; b ; empty ; lambda ; (+ a b) ; (^ a b) ; (* a) ; (* (+ a b)) ; (+ (* a) (^ b c)) ; ; We don't provide much error checking, just to keep the exposition simple. ; This is the assumed alphabet for letters in the regular expressions: (define alphabet '(0 1 2 3 4 5 6 7 8 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z)) (define emp 'empty) ; Avoid magic strings in the code (define lam 'lambda) ; lambda is reserved, so can't be an identifier (define union-symbol '+) ; Define the three operator symbols (define concat-symbol '^) (define star-symbol '*) ; In the algorithm, the original regular expression and its derivative become the states of the DFA. ; The initial state is the original regular expression. ; Accepting states are those states for which lambda (the empty sequence) is an element of the ; corresponding regular expression. The key parts of the conversion are determining derivatives, ; which are simplified to keep the resulting state set small, checking whether the set represented by ; an expression contains lambda, the empty string, and controlling the whole process, which is done ; by creating new regular expressions depth-first from existing ones. Here we need to check whether ; an expression was already created, and avoid processing it again, to avoid infinite loops. (load "tester.scm") ; deriv computes the derivative of any regular expression regex ; with respect to a letter. (define (deriv letter regex) (cond ((isEmpty regex) emp) ((isLambda regex) emp) ((isLetter regex) (derivLetter letter regex)) ((isUnion regex) (derivUnion letter regex)) ((isConcat regex) (derivConcat letter regex)) ((isStar regex) (derivStar letter regex)) (else (((error "invalid regular expression " regex)))))) ; union forms the union of two sets represented as lists ; Neither list should have any duplicates, and the result is then guaranteed not to. (define (union x y) (if (null? x) y (if (member (first x) y) (union (rest x) y) (cons (first x) (union (rest x) y))))) ; union-all returns the union of a list of lists, provided that none of the ; inner lists has duplicates. (define (union-all L) (foldr union () L)) ; The following are discriminator functions used in dispatching the type of expression ; They are not completely thorough in checking the form of the expression. (define (isEmpty regex) (equal? regex emp)) (define (isLambda regex) (equal? regex lam)) (define (isLetter regex) (member regex alphabet)) (define (isUnion regex) (and (list? regex) (not (null? regex)) (equal? (first regex) union-symbol))) (define (isConcat regex) (and (list? regex) (not (null? regex)) (equal? (first regex) concat-symbol))) (define (isStar regex) (and (list? regex) (not (null? regex)) (equal? (first regex) star-symbol) (not (null? (rest regex))) (null? (rest (rest regex))))) ; The following extract derivatives for various cases ; Derivative of a single letter expression ; If letter is the same as regex, then the derivative is lambda. ; Otherwise the derivative is the empty set. (define (derivLetter letter regex) (if (equal? regex letter) lam emp)) ; Derivative of a union expression ; Map deriv over the arguments, then simplify. (define (derivUnion letter regex) (simplifyUnion (map (lambda(x) (deriv letter x)) (rest regex)))) ; simplifyUnion simplifies a union of multiple regular expressions ; The argument is a list of regular expressions of a union, but ; the union symbol itself is assumed not present. ; ; The union is first flattened, so that any sub-unions appear at ; the top level. ; Any empty sets in the union are dropped. ; Any duplicates are dropped. ; If there are any star sub-expressions, then lambdas are dropped. ; If any of the elements of the union is itself a union, then the ; elements of the inner union are moved to the outer level (define (simplifyUnion L) (let* ( (flat (flattenUnion L)) (clean (drop-instances emp flat)) (cleaner (drop-successive-equals (sort clean restring R) (symbol->string S))))) ; Flatten a union so that all sub-unions are at the same level (define (flattenUnion L) (if (null? L) () (if (isUnion (first L)) (flattenUnion (append (rest (first L)) (rest L))) (cons (first L) (flattenUnion (rest L)))))) (test (flattenUnion '(+ a (+ b c) (+ (+ d e) f))) '(+ a b c d e f)) ; Determine whether or not a list has any star sub-expressions. ; If it does, and this is a list of elements of a union, then ; any lambdas in the union can be eliminated. (define (any-stars L) (if (null? L) #f (if (isStar (first L)) #t (any-stars (rest L))))) (test (any-stars '(a b (* c))) #t) (test (any-stars '(a b c)) #f) ; Drop instances of an element from a list (define (drop-instances x L) (if (null? L) () (if (equal? x (first L)) (drop-instances x (rest L)) (cons (first L) (drop-instances x (rest L)))))) (test (drop-instances 'c '(a b c d c e f c g)) '(a b d e f g)) ; Drop any duplicates that occur one right after the other ; The first argument represents a dummy previous element that ; should be some element not in the list. (define (drop-successive-equals L) (if (null? L) () (cons (first L) (drop-successive-equals-helper (first L) (rest L))))) (define (drop-successive-equals-helper previous L) (if (null? L) () (if (equal? (first L) previous) (drop-successive-equals-helper previous (rest L)) (cons (first L) (drop-successive-equals-helper (first L) (rest L)))))) (test (drop-successive-equals '(a b b c d c e e e)) '(a b c d c e)) (test (derivUnion 1 '(+ 0 1)) lam) ; Derivative of a concatenation expression ; This is the most complex, because we have to distinguish ; between whether the first element "contains lambda" or not. (define (derivConcat letter regex) (derivConcat-helper letter (rest regex)) ) (define (derivConcat-helper letter subexps) (if (null? subexps) emp (let* ( (first-deriv (deriv letter (first subexps))) (partial-result (simplifyConcat (cons first-deriv (rest subexps)))) ) (if (contains-lambda (first subexps)) (simplifyUnion (list (derivConcat-helper letter (rest subexps)) partial-result )) partial-result)))) ; Simplify a concatenation by ; simplifyConcat simplifies a union of concatenation regular expressions ; The argument is a list of regular expressions of a concatenation, but ; the concatenation symbol itself is assumed not present. ; It works by: ; flattening out any sub-concatenations ; dropping any lambda's ; dropping any star sub-expressions that are equal (define (simplifyConcat L) (let ((flat (flattenConcat L))) (if (has-instance emp flat) emp (let* ( (clean (drop-instances lam flat)) (cleaned (drop-successive-equal-stars () clean)) ) (if (null? cleaned) lam (if (null? (rest cleaned)) (first cleaned) (cons concat-symbol cleaned))))))) ; Flatten a concatenation so that all sub-concatenations are at the same level (define (flattenConcat L) (if (null? L) () (if (isConcat (first L)) (flattenConcat (append (rest (first L)) (rest L))) (cons (first L) (flattenConcat (rest L)))))) (test (flattenConcat '(^ a (^ b c) (^ (^ d e) f))) '(^ a b c d e f)) ; Determine whether a list has an instance of a specified element (define (has-instance x L) (if (null? L) #f (if (equal? x (first L)) #t (has-instance x (rest L))))) ; Drop any consecutive star sub-expressions that are equal. (define (drop-successive-equal-stars previous L) (if (null? L) () (if (and (isStar (first L)) (equal? (first L) previous)) (drop-successive-equal-stars previous (rest L)) (cons (first L) (drop-successive-equal-stars (first L) (rest L)))))) ; Derivative of a star expression (define (derivStar letter regex) (simplifyConcat (map simplifyStar (list (deriv letter (second regex)) regex)))) ; simplifyStar replaces (* (* R)) with (* R), recursively (define (simplifyStar L) (if (isStar L) (if (isStar (second L)) (simplifyStar (second L)) L) L)) ; Determine by syntactic analysis whether or not the argument regular expression ; represents a set with lambda as an element (define (contains-lambda regex) (cond ((isEmpty regex) #f) ((isLambda regex) #t) ((isLetter regex) #f) ((isUnion regex) (foldr (lambda (x y) (or x y)) #f (map contains-lambda (rest regex)))) ((isConcat regex) (foldr (lambda (x y) (and x y)) #t (map contains-lambda (rest regex)))) ((isStar regex) #t) (else ((error "invalid regular expression " regex))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Make a DFA for a regular expression. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; This uses the derivatives method. ; The states of the generated dfa are regular expressions. ; The initial state is the given regular expression. ; The accepting states are those containing lambda. ; There is a transition from state R to state S via letter whenever ; S is (deriv letter R) ; ; In order for this to function to terminate, simplifications of states need to be made ; in such a way that checking equivalence of the generated regular expressions can be ; done syntactically. Brzozowski's original paper proves this is possible. The simplifications ; of regular expressios described earlier are my attempt to do the necessary syntactic checks. ; I haven't proved that my implementation is complete, so it is possible that this procedure ; will fail to terminate for some regular expressions. For this reason, if absolute safety ; is required, it might be better to use a different method for creating the dfa, such as ; by creating an nfa and converting it to a dfa. It should be easier to establish the ; correctness of the alternate implementation. (define (re2dfa alphabet regex) (re2dfa-helper alphabet (list regex) () ())) ; Make a DFA by creating states and transitions for a backlog of regular expressions. ; accum is the accumulated list of states and transitions. Accumulation is done ; by stacking new rows atop old, so the final result has to be reversed, so that ; the initial state is first. accum is also used to check whether a state has been ; seen before, and if so, avoid reprocessing it. (define (re2dfa-helper alphabet backlog seen accum) (if (null? backlog) (reverse accum) ; backlog eliminated, return accumulation (let ((regex (first backlog))) (if (member regex seen) (re2dfa-helper alphabet (rest backlog) seen accum) ; already processed (let* ( ; not processed before (transitions (makeTransitions alphabet regex)) (newRow (makeRow regex transitions)) (moreStates (filter (lambda (state) (not (member state seen))) (map second transitions))) ) (re2dfa-helper ; process resulting backlog alphabet (append (rest backlog) moreStates) (cons regex seen) (cons newRow accum))))))) ; Make the transitions from a given state, a regular expression, ; by taking derivatives with respect to each letter in the alphabet. (define (makeTransitions alphabet regex) (if (null? alphabet) () (let* ( (letter (first alphabet)) (deriv1 (deriv letter regex)) ) (cons (list letter deriv1) (makeTransitions (rest alphabet) regex))))) ; Make a row of the state-transition table, adding an output value with the state ; and including the transitions. The output is 1 (accepting) if the set represented by ; the regular expression corresponding to the state has lambda as an element. (define (makeRow regex transitions) (let ((output (if (contains-lambda regex) 1 0))) (cons regex (cons output transitions)))) ; Prettify the DFA by replacing regular expression states with successive numbers (define (prettify dfa) (rename-from dfa -1 ())) ; Rename the states of a dfa to be numbers, starting with 1 greater than the given number. (define (rename-from dfa number alist) (if (null? dfa) () (let* ((row (first dfa)) (new-alist-and-number (add-to-alist number alist (cons (first row) (map second (rest (rest row)))))) ; augment symbol table (new-number (first new-alist-and-number)) (new-alist (second new-alist-and-number)) (new-row ; reconstruct row using augmented table (cons (second (assoc (first row) new-alist)) (cons (second row) (map (lambda(x) (list (first x) (second (assoc (second x) new-alist)))) (rest (rest row)))))) ) (cons new-row (rename-from (rest dfa) new-number new-alist))))) ; Add a new association (between a state and a number) to an association list. (define (add-to-alist number alist states) (if (null? states) (list number alist) (let* ( (state (first states)) (found (assoc state alist))) (if found (add-to-alist number alist (rest states)) (let* ( (new-number (+ 1 number)) (new-alist (cons (list state new-number) alist)) ) (add-to-alist new-number new-alist (rest states))))))) ; Get the row of a dfa, which contains the output and transitions, given the state name. (define (get-row state-name dfa) (let ((found (assoc state-name dfa))) (if found found (error "named state not found in machine: " state-name dfa)))) ; Remove all duplicates from a list of regular expressions (define (remove-duplicates L) (define (remove-duplicates-helper L previous) (if (null? L) () (if (equal? (first L) previous) (remove-duplicates-helper (rest L) previous) (cons (first L) (remove-duplicates-helper (rest L) (first L)))))) (remove-duplicates-helper (sort L re