* This do file calculates the Damerau-Levenshtein distance between two strings
* allowing for insertion, deletion, or substitution of individual characters and
* transposition of adjacent letters. Each operation increases the distance by one unit.
* The algorithm calculates the distance between `dlstring1' and `dlstring2'.
* The output is contained in the generated variable `damlevdistance'.
*
* Inputs : string variables dlstring1 and dlstring2 must already exist in the data.
* Output : numeric variable damlevdistance, set to missing when either input is blank.
* All other variables generated below are scratch variables and are dropped at the end.
quietly {
    * Working copies: upper-cased and trimmed of leading and trailing blanks.
    gen workstring1=trim(upper(dlstring1))
    gen workstring2=trim(upper(dlstring2))
    * Row identifier, used further down to write the result back to observation a.
    gen dmid=_n
    * Flag rows where either working string is empty. The empty string is then
    * replaced by the placeholder "UK" so the per-row code still has something to
    * process, and the result for flagged rows is set to missing at the very end.
    gen missingname=0
    replace missingname=1 if workstring1==""
    replace missingname=1 if workstring2==""
    replace workstring1="UK" if workstring1==""
    replace workstring2="UK" if workstring2==""
    * Longest working string on each side, taken from r(max) after summarize.
    gen namelen1=length(workstring1)
    gen namelen2=length(workstring2)
    sum namelen1
    local longname1=r(max)
    sum namelen2
    local longname2=r(max)
    * Delete the character at position j whenever it exists and is not a letter A-Z.
    * NOTE(review): the loop starts at j=2, so the first character is never examined.
    * NOTE(review): after a deletion the next character moves into position j and the
    * loop advances to j+1, so the second of two adjacent non-letters is not examined
    * in this pass - confirm whether either of these is intended.
    forvalues j=2(1)`longname1'{
        replace workstring1=substr(workstring1,1,`j'-1)+substr(workstring1,`j'+1,length(workstring1)-`j') if regexm(substr(workstring1,`j',1),"([A-Z])")~=1 & substr(workstring1,`j',1)~=""
    }
    * Same clean-up for the second string.
    forvalues j=2(1)`longname2'{
        replace workstring2=substr(workstring2,1,`j'-1)+substr(workstring2,`j'+1,length(workstring2)-`j') if regexm(substr(workstring2,`j',1),"([A-Z])")~=1 & substr(workstring2,`j',1)~=""
    }
    * NOTE(review): iletter and jletter are not referenced in the visible code apart
    * from the final drop; they may be used in the portion of the file that is missing.
    gen iletter="A"
    gen jletter="A"
    * Result variable, filled one observation at a time by the loop below.
    gen damlevdistance=0
    * Initial values only; the forvalues loops below redefine a, c and f.
    local a=1
    local b=1
    local c=1
    local f=1
    local lastobs=_N
    * Scratch variables for the padding search inside the per-observation loop.
    gen matchcount=0
    gen oldmatchcount=0
    gen paddedworkstring1="A"
    gen paddedworkstring2="A"
    gen tempworkstring1=workstring1
    gen tempworkstring2=workstring2
    * Main loop: one pass per observation a.
    forvalues a=1(1)`lastobs'{
        *if `dlmatch'==0 {
        replace oldmatchcount=0 if _n==`a'
        local alength=length(workstring1[`a'])
        local blength=length(workstring2[`a'])
        * Padding search, string 1 longer than string 2. For every split c of the
        * length difference, put (difference - c) Z characters in front of string 2
        * and c Z characters behind it, count the positions at which the padded
        * string agrees with string 1, and keep the candidate whose count is at least
        * as large as the best so far (so ties go to the later candidate).
        if `alength'>`blength'{
            local deltalength=`alength'-`blength'
            forvalues c=0(1)`deltalength'{
                replace tempworkstring2=substr("ZZZZZZZZZZZZZZZZZZZZZZZZZZZ",1,`deltalength'-`c')+workstring2+substr("ZZZZZZZZZZZZZZZZZZZZZZZZZZZ",1,`c') if _n==`a'
                replace matchcount=0 if _n==`a'
                forvalues f=1(1)`alength'{
                    replace matchcount=matchcount+1 if substr(tempworkstring2,`f',1)==substr(workstring1,`f',1) & _n==`a'
                }
                replace paddedworkstring2=tempworkstring2 if matchcount>=oldmatchcount & _n==`a'
                replace oldmatchcount=matchcount if _n==`a' & matchcount>=oldmatchcount
            }
        }
        * Mirror case: string 2 longer than string 1, so string 1 is the one padded.
        if `blength'>`alength'{
            local deltalength=`blength'-`alength'
            forvalues c=0(1)`deltalength'{
                replace tempworkstring1=substr("ZZZZZZZZZZZZZZZZZZZZZZZZZZZ",1,`deltalength'-`c')+workstring1+substr("ZZZZZZZZZZZZZZZZZZZZZZZZZZZ",1,`c') if _n==`a'
                replace matchcount=0 if _n==`a'
                forvalues f=1(1)`blength'{
                    replace matchcount=matchcount+1 if substr(tempworkstring1,`f',1)==substr(workstring2,`f',1) & _n==`a'
                }
                replace paddedworkstring1=tempworkstring1 if matchcount>=oldmatchcount & _n==`a'
                replace oldmatchcount=matchcount if _n==`a' & matchcount>=oldmatchcount
            }
        }
        * NOTE(review): the two replaces below are commented out, so in the visible
        * code the padded strings found above are never copied into the working
        * strings; the padding search then only costs run time - confirm and consider
        * removing it.
        * replace workstring1=paddedworkstring1 if _n==`a' & `blength'>`alength'
        * replace workstring2=paddedworkstring2 if _n==`a' & `alength'>`blength'
        * Lengths of the two working strings for this observation and derived sizes.
        local alength=length(workstring1[`a'])
        local blength=length(workstring2[`a'])
        local totlength=`alength'+`blength'
        local aplus2=`alength'+2
        local aplus1=`alength'+1
        local bplus2=`blength'+2
        local bplus1=`blength'+1
        local letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        mata {
            // H is the distance matrix with two extra rows and two extra columns;
            // its first row is filled with the sum of the two string lengths.
            H=J(`alength'+2,`blength'+2,0)
            for(b=1; b<=cols(H); b++){
                H[1,b]=`totlength'
            }
            // NOTE(review): source text appears to be missing on the next line. It
            // jumps from the start of a Mata for-loop header to the tail of a Stata
            // if-condition, so the rest of the matrix set-up, the loops over i and j
            // and the definitions of hiplus1j, hijplus1, hi1j1, I1, J1, ipos and
            // tempa are not visible here. Restore them from the original file; this
            // script cannot run as it stands.
            for(b=2; b`tempa'{
                    local hiplus1jplus1 = `hiplus1j'+1
                }
                * The visible steps below lower the candidate value for the current
                * cell whenever an alternative is cheaper: first hijplus1 plus one,
                * then the value in tempc, which is built from hi1j1 and the offsets
                * of i and j from I1 and J1.
                * NOTE(review): presumably these are the remaining single-edit case
                * and the transposition case - confirm against the missing text.
                local tempb=`hijplus1'+1
                if `hiplus1jplus1'>`tempb'{
                    local hiplus1jplus1 = `hijplus1'+1
                }
                local tempc=`hi1j1'+`i'-`I1'-1+1+`j'-`J1'-1
                if `hiplus1jplus1'>`tempc'{
                    local hiplus1jplus1 = `hi1j1'+`i'-`I1'-1+1+`j'-`J1'-1
                }
                mata {
                    // Store the value chosen for this cell.
                    H[`i'+2,`j'+2]=`hiplus1jplus1'
                }
            }
            mata {
                // Record i in column ipos of L.
                // NOTE(review): presumably the last row at which this letter was
                // seen; L and ipos are defined in the missing text - confirm.
                L[1,`ipos']=`i'
            }
        }
        * The value computed last for this observation is its distance.
        replace damlevdistance=`hiplus1jplus1' if dmid==`a'
        *}
    }
    * Rows that had a blank input get a missing distance, then scratch variables go.
    replace damlevdistance=. if missingname==1
    drop workstring1 workstring2 dmid missingname namelen1 namelen2 iletter jletter paddedworkstring1 paddedworkstring2 tempworkstring1 tempworkstring2 matchcount oldmatchcount
}