From Newsgroup: comp.lang.awk
# largest q-gram (longest common substring) in AWK: Michael Sanders - 2024
# A q-gram is a contiguous sequence of characters of fixed length
# see also:
https://en.wikipedia.org/wiki/N-gram
BEGIN {
str1 = "MAWK"
str2 = "gawk"
q = largest_qgram(str1, str2)
l = length(q)
print str1 " : " str2
if (length(q) > 0) {
printf("largest q-gram (longest common substring): \"%s\" size: %d\n", q, l)
} else {
printf("no common substring found\n")
}
}
# find largest q-gram (longest common substring) between 2 words
function largest_qgram(str1, str2, i, j, maxlen, largest, tmp) {
maxlen = 0
largest = ""
# for convenience's sake
str1 = tolower(str1)
str2 = tolower(str2)
# loop over each character of 1st string
for (i = 1; i <= length(str1); i++) {
# loop over each character of 2nd string
for (j = 1; j <= length(str2); j++) {
tmp = ""
# check for matching substrings at positions i and j
while (substr(str1, i + length(tmp), 1) == substr(str2,
j + length(tmp), 1) && i + length(tmp) <=
length(str1) && j + length(tmp) <= length(str2)) {
tmp = tmp substr(str1, i + length(tmp), 1)
}
# if matching substring is longer than previous maximum
if (length(tmp) > maxlen) {
maxlen = length(tmp)
largest = tmp
}
}
}
return largest
}
# eof
--
:wq
Mike Sanders
--- Synchronet 3.20a-Linux NewsLink 1.114