• Longest Common Substring

    From porkchop@porkchop@invalid.foo (Mike Sanders) to comp.lang.awk on Fri Aug 23 04:39:59 2024
    From Newsgroup: comp.lang.awk

    # largest q-gram (longest common substring) in AWK: Michael Sanders - 2024
    # A q-gram is a contiguous sequence of characters of fixed length
    # see also: https://en.wikipedia.org/wiki/N-gram

    BEGIN {
    str1 = "MAWK"
    str2 = "gawk"
    q = largest_qgram(str1, str2)
    l = length(q)

    print str1 " : " str2

    if (length(q) > 0) {
    printf("largest q-gram (longest common substring): \"%s\" size: %d\n", q, l)
    } else {
    printf("no common substring found\n")
    }

    }

    # find largest q-gram (longest common substring) between 2 words
    function largest_qgram(str1, str2, i, j, maxlen, largest, tmp) {
    maxlen = 0
    largest = ""

    # for convenience's sake
    str1 = tolower(str1)
    str2 = tolower(str2)

    # loop over each character of 1st string
    for (i = 1; i <= length(str1); i++) {
    # loop over each character of 2nd string
    for (j = 1; j <= length(str2); j++) {
    tmp = ""
    # check for matching substrings at positions i and j
    while (substr(str1, i + length(tmp), 1) == substr(str2,
    j + length(tmp), 1) && i + length(tmp) <=
    length(str1) && j + length(tmp) <= length(str2)) {
    tmp = tmp substr(str1, i + length(tmp), 1)
    }

    # if matching substring is longer than previous maximum
    if (length(tmp) > maxlen) {
    maxlen = length(tmp)
    largest = tmp
    }
    }
    }
    return largest
    }

    # eof
    --
    :wq
    Mike Sanders

    --- Synchronet 3.20a-Linux NewsLink 1.114