Re: string comparator by Matt
Matt
Fri Feb 02 08:02:35 CST 2007
On Feb 2, 1:42 pm, Helmut Weber <nbhymsjxd...@mailinator.com> wrote:
> Hi Matt,
>
> >Do you have an idea...
>
> yes.
>
> It seems that a tool to calculate similarity
> between strings is very much asked for nowadays.
> There was a question in the german groups about it.
>
> IMHO, I think, my solution is in principle better
> than the "levenshtein distance".
> Somewhat to google for.
>
> If you want to know it all, the code below
> will give you a correlation coefficient between two strings.
>
> Don't be afraid, some things are complicated,
> and there is no easy to understand solution.
>
> Disregard the comments in german.
> To comment it all, it would take me a week.
>
> For
> str1 =3D "alberto parreira , alberto juan fernandez parreira"
> str2 =3D "alberto-juan parreira , alberto juuan parreira."
> I get
> Correlation(Character) =3D 0.82
> Correlation(Substring) =3D 0.54
> Correlation(combined ) =3D 0.72
>
> Whether this is sufficient for you, I don't know.
>
> Just have a go and good luck,
> and beware of line breaks by the newsreader.
>
> Option Explicit
> Sub Correlation()
> Dim str1 As String
> Dim str2 As String
> Dim CorChrc As Single ' correlation by character
> Dim CorStrn As Single ' correlation by string
>
> str1 =3D "alberto parreira , alberto juan fernandez parreira"
> str2 =3D "alberto-juan parreira , alberto juuan parreira."
> CorChrc =3D FncCorChr(str1, str2)
> CorStrn =3D FncCorStr(str1, str2)
> Debug.Print "Correlation(Character) =3D " & Format(CorChrc, " 0.00")
> Debug.Print "Correlation(Substring) =3D " & Format(CorStrn, " 0.00")
> Debug.Print "Correlation(combined ) =3D " & Format((CorChrc * 2 +
> CorStrn) /
> 3, " 0.00")
> End Sub
>
> Public Function FncCorChr(str1$, str2$) As Single
> ' Correlation by set of characters
> ' =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D
> ' Union =3D Anzahl(1) + Anzahl(2)
> ' Relation =3D kleinerer Wert durch gr=F6=DFerer Wert
> ' Durchschnitt =3D Relation /2
> ' gewichteter Durchschnitt =3D Durchschnitt * Union
> '
> ' Word Correlation WrdCor
> ' NumCom =3D Summe aller Vergleiche
> ' WrdCor =3D Summe aller ZeichenCorrelationen
> ' WrdCor =3D WrdCor / (NumCom/2) 'Durchschnitt der Vergleiche
>
> Dim l As Long
>
> Dim ChrNum As Long
> Dim ComNum As Long ' number of comparisons
> Dim ChrCor As Single ' character correlation
> Dim WrdCor As Single
> Dim Union As Long
>
> Dim ArChr01(32 To 255) As Long
> Dim ArChr02(32 To 255) As Long
>
> For l =3D 32 To 255 ' clear arrays
> ArChr01(l) =3D 0
> ArChr02(l) =3D 0
> Next
> For l =3D 1 To Len(str1) ' count frequency
> ChrNum =3D Asc(Mid$(str1, l, 1))
> ArChr01(ChrNum) =3D ArChr01(ChrNum) + 1
> Next
> For l =3D 1 To Len(str2)
> ChrNum =3D Asc(Mid$(str2, l, 1))
> ArChr02(ChrNum) =3D ArChr02(ChrNum) + 1
> Next
>
> ComNum =3D 0
> WrdCor =3D 0
>
> For l =3D 32 To 255
> Union =3D ArChr01(l) + ArChr02(l)
> If Union =3D 0 Then GoTo fertig ' Don't process
> If ArChr01(l) =3D 0 Or ArChr02(l) =3D 0 Then ' zero anyway
> ChrCor =3D 0
> GoTo weiter
> End If
> If ArChr01(l) =3D ArChr02(l) Then ' short cut
> ChrCor =3D Union / 2
> GoTo weiter
> End If
> If ArChr01(l) <> ArChr02(l) Then
> If ArChr01(l) > ArChr02(l) Then
> ChrCor =3D ArChr02(l) / ArChr01(l)
> ChrCor =3D ChrCor / 2
> ChrCor =3D ChrCor * Union
> End If
> If ArChr01(l) < ArChr02(l) Then
> ChrCor =3D ArChr01(l) / ArChr02(l)
> ChrCor =3D ChrCor / 2
> ChrCor =3D ChrCor * Union
> End If
> End If
> weiter:
> WrdCor =3D WrdCor + ChrCor
> fertig:
> Next l
> ComNum =3D 0
>
> For l =3D 32 To 255
> ComNum =3D ComNum + ArChr01(l) + ArChr02(l)
> Next
>
> FncCorChr =3D WrdCor / (ComNum / 2)
>
> End Function
>
> Public Function FncCorStr(LongStr$, ShrtStr$) As Single
> ' get substrings longer than minimum length
> ' get number of all strings
> ' get number of common strings
> ' calculate relation of common strings to all strings
>
> Dim ShrtLen As Long
> Dim LongLen As Long
>
> Dim f As Boolean ' found
> Dim IsInComm As Boolean ' substring is in common
> Dim l As Long
> Dim m As Long
> Dim n As Long
> Dim p As Long ' position
>
> Dim s0 As String
> Dim S1 As String
>
> Dim ShrtLoc As String
> Dim LongLoc As String
> Dim TempLoc As String
> Dim shrtMin As Long
> Dim ShrtSum As Long ' 1 + 2 + n for long
> Dim LongSum As Long ' 1 + 2 + n for long
> Dim HalfArr As Long ' half way of array
> Dim HalfStp As Long
> Dim ComLSum As Long ' sum of length of common
>
> Dim ShrtStrItm() As String ' substrings short
> Dim LongStrItm() As String ' substrings long
> Dim CommStrItm() As String ' common strings
> Dim ShrtStrFrq() As Long ' frequency short
> Dim LongStrFrq() As Long ' frequency long
>
> ShrtLoc =3D ShrtStr ' local value
> LongLoc =3D LongStr ' local value
>
> ShrtSum =3D 0
> LongSum =3D 0
> LongLoc =3D LCase(LongLoc)
> ShrtLoc =3D LCase(ShrtLoc)
>
> If Len(ShrtLoc) > Len(LongLoc) Then
> TempLoc =3D LongLoc
> LongLoc =3D ShrtLoc
> ShrtLoc =3D TempLoc
> End If
>
> LongLen =3D Len(LongLoc)
> ShrtLen =3D Len(ShrtLoc)
>
> shrtMin =3D 2 ' CLng(TxSubMin.Text)
> ' k=FCrzester zu untersuchender Substring
> ' ------------------------------------- number of substrings
> ' --------------------------------------------- Summenformel
> For l =3D 1 To ShrtLen - (shrtMin - 1)
> ShrtSum =3D ShrtSum + l
> Next
> ' ---------------------------- redim array for short strings
> ReDim ShrtStrItm(ShrtSum)
> ReDim ShrtStrFrq(ShrtSum)
>
> For l =3D 1 To LongLen - (shrtMin - 1)
> LongSum =3D LongSum + l
> Next
> ' ----------------------------- redim array for long strings
> ReDim LongStrItm(LongSum)
> ReDim LongStrFrq(LongSum)
>
> '___________________________________________________________
> ' ---------------------- add subs of shorter string to array
> n =3D 0
> For l =3D 1 To ShrtLen - (shrtMin - 1) ' 1 2
> p =3D 0 ' 5
> For m =3D 1 To l
> n =3D n + 1
> p =3D p + 1
> ShrtStrItm(n) =3D Mid(ShrtLoc, p, ShrtLen - l + 1)
> Next
> Next
> ' ----------------------- add subs of longer string to array
> n =3D 0
> For l =3D 1 To LongLen - (shrtMin - 1) ' 1 2
> p =3D 0 ' 5
> For m =3D 1 To l
> n =3D n + 1
> p =3D p + 1
> LongStrItm(n) =3D Mid(LongLoc, p, LongLen - l + 1)
> Next
> Next
> ' ----------------------------------- Count freqencies short
> ' -------------------------- get index of first short string
> ' ------------------------- equal half length of long string
>
> HalfStp =3D 0
> HalfArr =3D CLng((ShrtLen) / 2)
> For l =3D 1 To HalfArr
> HalfStp =3D HalfStp + l
> Next
> For l =3D 1 To HalfStp
> ShrtStrFrq(l) =3D 1
> Next
> For l =3D HalfStp + 1 To ShrtSum
> ShrtStrFrq(l) =3D FncStrCnt(ShrtLoc, ShrtStrItm(l))
> Next
>
> ' --------------------------------- remove double from array
>
> For l =3D 1 To ShrtSum
> For m =3D l + 1 To ShrtSum
> If ShrtStrItm(l) =3D ShrtStrItm(m) Then
> For n =3D m To ShrtSum - 1
> ShrtStrItm(n) =3D ShrtStrItm(n + 1) ' verschieben
> ShrtStrFrq(n) =3D ShrtStrFrq(n + 1)
> Next n
> ShrtSum =3D ShrtSum - 1
> ReDim Preserve ShrtStrItm(ShrtSum)
> ReDim Preserve ShrtStrFrq(ShrtSum)
> Exit For
> End If
> Next m
> Next l
>
> ' ---------------- end of collecting data for shorter string
> '___________________________________________________________
>
> ' ----------------------------------- first common substring
> ' --------------------------- beware of no common substrings
>
> f =3D False
> For l =3D 1 To ShrtSum
> If InStr(LongLoc, ShrtStrItm(l)) > 0 Then
> f =3D True
> ReDim CommStrItm(1)
> CommStrItm(1) =3D ShrtStrItm(l)
> Exit For
> End If
> Next
> If f =3D False Then
> FncCorStr =3D 0
> Exit Function
> End If
>
> n =3D 1
> '--------------------------------- further common substrings
> S1 =3D CommStrItm(1)
> For m =3D l + 1 To ShrtSum ' ab gefunden weitersuchen
> s0 =3D ShrtStrItm(m)
> If (InStr(LongLoc, s0) > 0) Then
> IsInComm =3D False
> For p =3D 1 To n
> If InStr(CommStrItm(p), s0) > 0 Then
> IsInComm =3D True
> Exit For
> End If
> Next
> If Not IsInComm Then
> n =3D n + 1
> ReDim Preserve CommStrItm(n)
> CommStrItm(n) =3D s0
> End If
> End If
> Next
>
> ComLSum =3D 0
> For l =3D 1 To n
> ComLSum =3D ComLSum + Len(CommStrItm(l))
> Next
>
> If ComLSum > LongLen Then
> FncCorStr =3D LongLen / ComLSum
> Else
> FncCorStr =3D ComLSum / LongLen
> End If
>
> End Function
>
> Public Function FncStrCnt(Lng$, Shr$) As Long
> ' ----------------------------------- count string in string
> Dim l As Long ' position
> Dim m As Long ' counter
> l =3D 1
> m =3D 0
> While InStr(l, Lng, Shr) > 0
> m =3D m + 1
> l =3D InStr(l, Lng, Shr) + 1
> Wend
> FncStrCnt =3D m
>
> End Function
>
> --
> Greetings from Bavaria, Germany
>
> Helmut Weber, MVP WordVBA
>
> Win XP, Office 2003
> "red.sys" & Chr$(64) & "t-online.de"
Thank you very much Helmut
I think this solution will be great for me
just to see, i will try to compare it with the levenshtein distance
Cheers