CHRIS RAE'S VBA PAGES

Routine Archive


Module:RegExpStuff
Description:Regular Expression parser


' Version 2 was using my matrix theory (which certainly
' seems to work, if I do say so myself). In version 3
' I have abandoned the "FirstInLine" and "LastInLine"
' concepts as they are completely redundant. Although
' it took me ages to work it out. CLR, 2/6/99 to 16/6/99.

' The table type - have to define this at module level because
' you can't define types anywhere else.
Private Type tyRegExpTable
    stCharMatch As String
    inMatchType As Integer
    boRepeat As Boolean
End Type

' The inMatchType constants
Private Const coNormal As Integer = 1
Private Const coAnyChar As Integer = 2
Private Const coOpposite As Integer = 3

' The RegExp table - I have to define this at module level too because
' I can't return arrays of user-defined types from functions.
Private REtable() As tyRegExpTable

' The comparison table. Defined at module level because I'm using it
' in lots of different procedures.
Private boCoTable() As Boolean
Public Sub MakeCoTable(ByVal stTest As String, ByVal stRegExp As String)
    ' Make up the comparison table. CLR, 3/6/99.

    ' The loop through the regexp string
    Dim inREloop As Integer
    ' And the one through the normal string
    Dim inSTloop As Integer

    ' Clear the table to its defaults. A few of
    ' the booleans aren't explicitly set to
    ' "false" but as they default to that when
    ' the variable is cleared, it doesn't matter
    ' all that much.
    ReDim boCoTable(Len(stTest), UBound(REtable))

    For inREloop = 1 To UBound(REtable)
        For inSTloop = 1 To Len(stTest)
            ' Test this match and put the result in the matrix.
            boCoTable(inSTloop, inREloop) = MatchChar(stTest, inSTloop, inREloop)
        Next inSTloop
    Next inREloop
End Sub
Private Function MatchChar(stToMatch As String, inPos As Integer, inREtableEntry As Integer) As Boolean
    ' Match the given character against the given entry
    ' in the RegExp table. Note that repeats don't matter
    ' here as they're dealt with at the path level. Note
    ' that we are relying on the fact that the function
    ' will default to false.
    ' We need to pass a string and position rather than
    ' a char because we need to be able to know whether
    ' it is the first or last character. CLR, 3/6/99.

    Dim chToMatch As String
    chToMatch = Mid(stToMatch, inPos, 1)
    
    Select Case REtable(inREtableEntry).inMatchType
        Case coNormal
            ' Nice and simple - just match it (remember to
            ' compare against the whole string in the
            ' regexp table)
            If InStr(REtable(inREtableEntry).stCharMatch, chToMatch) > 0 Then MatchChar = True
        Case coOpposite
            ' Don't match the regexp string.
            If InStr(REtable(inREtableEntry).stCharMatch, chToMatch) = 0 Then MatchChar = True
        Case coAnyChar
                MatchChar = True
        Case Else
            ' If the program broke, say so and give up
            Debug.Print "Unrecognised regexp type.";
            Stop
    End Select
End Function
Public Sub ShowCoTable(XaxisTitle As String)
    ' Display the comparison table. The X axis
    ' title is a purely cosmetic thing. This table
    ' now incorporates the RegExp table, which
    ' used to be displayed in a different procedure
    ' entirely. CLR, 3/6/99.
    
    Dim y As Integer, x As Integer

    Debug.Print
    Debug.Print "Comparison Table:"
    Debug.Print
    
    Debug.Print XaxisTitle
    For y = 1 To UBound(boCoTable, 2)
        For x = 1 To UBound(boCoTable, 1)
            Debug.Print IIf(boCoTable(x, y), "#", "-");
        Next x
        Debug.Print " " & REtable(y).stCharMatch & _
                           " (type=" & REtable(y).inMatchType & _
                           ", repeat=" & REtable(y).boRepeat & ")"
    Next y
End Sub
Private Function MultiMatch(ByVal stMulti As String) As String
    ' Match the multiple-character definitions like
    ' [a-zA-Z] or [abc] or something. CLR, 3/6/99.

    Dim stCompleteSet As String
    Dim inAtPos As Integer
    Dim inBuildString As Integer

    ' Work down the string, eating it steadily. ;-)
    Do
        ' Just ignore escaped minuses!
        If Left(stMulti, 1) = "\" Then
            ' Add the escaped character.
            stCompleteSet = stCompleteSet & Mid(stMulti, 2, 1)
            stMulti = Mid(stMulti, 3)
        ElseIf Mid(stMulti, 2, 1) = "-" Then
            ' It's a multiple set. Add the chunk...
            For inBuildString = Asc(Mid(stMulti, 1, 1)) To Asc(Mid(stMulti, 3, 1))
                stCompleteSet = stCompleteSet & Chr(inBuildString)
            Next inBuildString
            stMulti = Mid(stMulti, 4)
        Else
            ' It's not a multiple set and it's nothing
            ' escaped - just add this one character
            stCompleteSet = stCompleteSet & Left(stMulti, 1)
            stMulti = Mid(stMulti, 2)
        End If
    Loop Until Len(stMulti) = 0

    MultiMatch = stCompleteSet
End Function
Public Sub MakeREtable(ByVal stRegExp As String)
    ' Make up the regexp matching table. CLR, 2/6/99.

    ' The loop to make the RegExp expression table
    Dim inBuildTable As Integer

    ' The current character we're working on
    Dim chThisChar As String
    
    ' Has the next character been escaped - false
    ' by default at definition time and reset at
    ' the end of the main loop
    Dim boEscaped As Boolean

    ReDim REtable(1 To 1)

    ' Start off before first entry - this variable
    ' is incremented during the loop.
    inBuildTable = 0
    Do
        ' Chop the first character from stRegExp
        chThisChar = Left(stRegExp, 1)
        stRegExp = Mid(stRegExp, 2)

        ' Head onto the next expression chunk.
        inBuildTable = inBuildTable + 1
        ' Make sure the array is big enough to fit the
        ' new thing in.
        ReDim Preserve REtable(1 To inBuildTable)
        
        ' Check for special-case ones.

        ' Repeater?
        If chThisChar = "*" And Not boEscaped Then
            ' Oops! This refers to the last one so
            ' we need to drop the index back a bit.
            inBuildTable = inBuildTable - 1
            ReDim Preserve REtable(1 To inBuildTable)
            REtable(inBuildTable).boRepeat = True

        ' First in line or last in line?
        ElseIf chThisChar = "^" Or chThisChar = "$" And Not boEscaped Then
            ' Not doing a blind thing.
            inBuildTable = inBuildTable - 1
            ' Because the "^" may well be the very first character,
            ' have to be careful with this one!
            If inBuildTable > 0 Then
                ReDim Preserve REtable(1 To inBuildTable)
            End If

        ' Listed characters?
        ElseIf chThisChar = "[" And Not boEscaped Then
            ' Put the whole matching string in there - THERE
            ' IS NO ERROR CHECKING FOR THE END OF THE SQUARE
            ' BRACKETS!!!
            ' If it has the caret (^) at the beginning of it
            ' just negate it.
            If Left(stRegExp, 1) = "^" Then
                ' It does have the caret - no big problem. Just chop
                ' the caret off and set it as being a negative match.
                REtable(inBuildTable).stCharMatch = MultiMatch(Mid(stRegExp, 2, InStr(stRegExp, "]") - 2))
                REtable(inBuildTable).inMatchType = coOpposite
            Else
                ' It's a list but it's not negative.
                REtable(inBuildTable).stCharMatch = MultiMatch(Left(stRegExp, InStr(stRegExp, "]") - 1))
                REtable(inBuildTable).inMatchType = coNormal
            End If
            ' And cut the RegExp string.
            stRegExp = Mid(stRegExp, InStr(stRegExp, "]") + 1)
        
        ' Dot thing?
        ElseIf chThisChar = "." And Not boEscaped Then
            REtable(inBuildTable).stCharMatch = "N/A"
            REtable(inBuildTable).inMatchType = coAnyChar

        ' Escape char?
        ElseIf chThisChar = "\" And Not boEscaped Then
            ' Escape the next one
            boEscaped = True
            ' And don't store this one
            inBuildTable = inBuildTable - 1
        
        Else
            ' Not a special char of any sort - just treat it normally
            REtable(inBuildTable).stCharMatch = chThisChar
            REtable(inBuildTable).inMatchType = coNormal
            ' If this was a char that was escaped, don't do it
            ' again
            boEscaped = False
        End If ' (special char / not special char)

    Loop Until stRegExp = ""
End Sub
Public Function TracePath(x As Integer, y As Integer) As Boolean
    ' Trace the path through the matrix to see whether
    ' the expressions actually match. This routine
    ' is horrendously recursive. CLR, 3/6/99.
    
    ' If we went off the bottom corner of the matrix,
    ' we have completed the path. Note that if the last
    ' element of the matrix is a repeating one, we can go
    ' off the X-end of the matrix but still be on the last
    ' Y-value
    If x > UBound(boCoTable, 1) And (y > UBound(boCoTable, 2) Or (y = UBound(boCoTable, 2) And REtable(UBound(boCoTable, 2)).boRepeat)) Then
        TracePath = True
        ' If we went past the end to the right, or
        ' off the bottom (but not *both*) then it's bad.
    ElseIf x > UBound(boCoTable, 1) Or y > UBound(boCoTable, 2) Then
        TracePath = False
    Else
        ' Wierd repeat-skipping thing that may or may not work.
        ' Because "repeat" technically means *zero* or more, a
        ' repeat may jump a gap in the matrix. Easiest way to
        ' understand this one is to look at a matrix and see
        ' the jumping goin' on.
        If (Not boCoTable(x, y)) And REtable(y).boRepeat Then
            TracePath = TracePath(x, y + 1)
        ' Okay, we're not off the matrix in any shape or
        ' form. Other thing worth remembering here is that
        ' if this particular cell isn't true, there's not
        ' much point in checking any further, so...
        ElseIf boCoTable(x, y) Then
            TracePath = TracePath(x + 1, y + 1) Or _
                        IIf(REtable(y).boRepeat, TracePath(x + 1, y), False) _
                        Or IIf(REtable(y).boRepeat, TracePath(x, y + 1), False)
        End If
    End If
End Function
Public Function RegExp(stTest As String, stRegExp As String) As Boolean
    ' Test a string against a given Regular Expression. CLR, 2/6/99.

    ' Make up the RegExp reference table
    MakeREtable stRegExp
    
    ' Make up the comparison table
    MakeCoTable stTest, stRegExp
    ' Display the table (for debug)
    ShowCoTable stTest

    RegExp = TracePath(1, 1)
End Function

You're free to use these routines for anything you want - all I ask is that for commercial use you give me credit somewhere. You may instead want to head back to the index for my Visual Basic for Applications Pages or the main routines archive page.