Search Apps Documentation Source Content File Folder Download Copy Actions Download

levenshtein.gno

4.83 Kb ยท 181 lines
  1// from www.youtube.com/watch?v=d-Eq6x1yssU
  2// Levenshtein distance is an algorithm to detect how close 2 words are.
  3// For example, "Float" and "Boats" have a levenstein distance of 3.
  4// Why ? Because you need 3 steps to go from Boats to Float
  5// - We start with the word (Boats)
  6// - Remove the s (Boat)
  7// - Insert the l (Bloat)
  8// - Substitute b with f (Float)
  9package levenshtein
 10
 11import (
 12	"strconv"
 13
 14	"gno.land/p/nt/avl"
 15)
 16
 17// Distance computes the Levenshtein distance between two strings.
 18//
 19// The Levenshtein distance is a measure of the similarity between two strings,
 20// defined as the minimum number of single-character edits (insertions,
 21// deletions, or substitutions) required to change one string into the other.
 22//
 23// For example, the Levenshtein distance between "Float" and "Boats" is 3:
 24//   - Remove 's' from "Boats" โ†’ "Boat"
 25//   - Insert 'l' โ†’ "Bloat"
 26//   - Substitute 'b' with 'f' โ†’ "Float"
 27//
 28// This implementation uses a memory-efficient dynamic programming approach
 29// that computes the distance in O(len(a) * len(b)) time and O(min(len(a), len(b))) space.
 30func Distance(a, b string) uint {
 31	// make sure a is the longest
 32	if len(a) < len(b) {
 33		a, b = b, a
 34	}
 35
 36	// init first row
 37	line := make([]uint, len(b)+1)
 38	for j := range line {
 39		line[j] = uint(j)
 40	}
 41
 42	// main loop
 43	for i := 1; i <= len(a); i++ {
 44		prev := line[0]
 45		line[0] = uint(i)
 46		for j := 1; j <= len(b); j++ {
 47			old := line[j]
 48			if a[i-1] == b[j-1] {
 49				line[j] = prev
 50			} else {
 51				line[j] = min3(
 52					line[j-1]+1, // insertion
 53					line[j]+1,   // deletion
 54					prev+1,      // substitution
 55				)
 56			}
 57			prev = old
 58		}
 59	}
 60	return line[len(b)]
 61}
 62
 63// Levenshteinable is an interface for types that can be compared using
 64// Levenshtein distance. It provides the LString method, which returns
 65// the string representation used for comparison.
 66//
 67// Note: This differs from the standard String() method, which may be used
 68// for display or debugging.
 69type Levenshteinable interface {
 70	LString() string
 71}
 72
 73// PickFromString returns the Levenshteinable object from the slice xs
 74// that is closest in Levenshtein distance to the input string xStr.
 75//
 76// It returns the best match along with its distance. If xs is empty,
 77// it returns (nil, 0).
 78//
 79// This function runs in O(n) time, where n is the length of the slice.
 80func PickFromString(xStr string, xs []Levenshteinable) (Levenshteinable, uint) {
 81	// check for empty list
 82	if len(xs) == 0 {
 83		return nil, 0
 84	}
 85
 86	// initialize
 87	bestDistance := Distance(xStr, xs[0].LString())
 88	best := xs[0]
 89
 90	// loop through objects
 91	for n := 1; n < len(xs); n++ {
 92		dist := Distance(xStr, xs[n].LString())
 93		if dist < bestDistance {
 94			bestDistance = dist
 95			best = xs[n]
 96		}
 97	}
 98	return best, bestDistance
 99}
100
101// Pick returns the Levenshteinable object from the slice xs that is
102// closest in Levenshtein distance to the input object x.
103//
104// This is a convenience wrapper around PickFromString(x.LString(), xs).
105// It runs in O(n) time.
106func Pick(x Levenshteinable, xs []Levenshteinable) (Levenshteinable, uint) {
107	return PickFromString(x.LString(), xs)
108}
109
110func padLeftNum(n uint, width int) string {
111	s := strconv.FormatUint(uint64(n), 10)
112	for len(s) < width {
113		s = "0" + s
114	}
115	return s
116}
117
118// SeveralPickFromString returns the n closest Levenshteinable objects
119// to the input string xStr from the slice xs, using Levenshtein distance.
120//
121// If xs is empty, it returns nil. If n is 0, it returns an empty slice.
122//
123// The result is not guaranteed to be sorted by distance.
124// This function runs in O(n) time using an AVL tree for efficient selection.
125func SeveralPickFromString(xStr string, xs []Levenshteinable, n uint) []Levenshteinable {
126	// error casing
127	if len(xs) == 0 {
128		return nil
129	}
130	if n == 0 {
131		return []Levenshteinable{}
132	}
133
134	tree := avl.NewTree()
135	maxKey := ""
136
137	for k, v := range xs {
138		// calculate distance and create unique key
139		d := Distance(xStr, v.LString())
140		// ufmt.Sprintf("%016u:%d", d, k)
141		key := padLeftNum(d, 6) + ":" + strconv.Itoa(k) // no sprintf ( ;-;)
142		tree.Set(key, v)
143
144		// remove the element with the greater key
145		if uint(tree.Size()) > n {
146			tree.ReverseIterate("", "", func(k string, _ any) bool {
147				maxKey = k
148				return true
149			})
150			tree.Remove(maxKey)
151		}
152	}
153
154	// create the list
155	out := make([]Levenshteinable, 0, tree.Size())
156	tree.Iterate("", "", func(k string, v any) bool {
157		out = append(out, v.(Levenshteinable))
158		return false
159	})
160	return out
161}
162
163// SeveralPick returns the n closest Levenshteinable objects to the
164// input object x from the slice xs, using Levenshtein distance.
165//
166// This is a convenience wrapper around SeveralPickFromString(x.LString(), xs, n).
167// It runs in O(n) time.
168func SeveralPick(x Levenshteinable, xs []Levenshteinable, n uint) []Levenshteinable {
169	return SeveralPickFromString(x.LString(), xs, n)
170}
171
172func min3(a, b, c uint) uint {
173	min := a
174	if b < min {
175		min = b
176	}
177	if c < min {
178		min = c
179	}
180	return min
181}