levenshtein.gno
4.83 Kb ยท 181 lines
1// from www.youtube.com/watch?v=d-Eq6x1yssU
2// Levenshtein distance is an algorithm to detect how close 2 words are.
3// For example, "Float" and "Boats" have a levenstein distance of 3.
4// Why ? Because you need 3 steps to go from Boats to Float
5// - We start with the word (Boats)
6// - Remove the s (Boat)
7// - Insert the l (Bloat)
8// - Substitute b with f (Float)
9package levenshtein
10
11import (
12 "strconv"
13
14 "gno.land/p/nt/avl"
15)
16
17// Distance computes the Levenshtein distance between two strings.
18//
19// The Levenshtein distance is a measure of the similarity between two strings,
20// defined as the minimum number of single-character edits (insertions,
21// deletions, or substitutions) required to change one string into the other.
22//
23// For example, the Levenshtein distance between "Float" and "Boats" is 3:
24// - Remove 's' from "Boats" โ "Boat"
25// - Insert 'l' โ "Bloat"
26// - Substitute 'b' with 'f' โ "Float"
27//
28// This implementation uses a memory-efficient dynamic programming approach
29// that computes the distance in O(len(a) * len(b)) time and O(min(len(a), len(b))) space.
30func Distance(a, b string) uint {
31 // make sure a is the longest
32 if len(a) < len(b) {
33 a, b = b, a
34 }
35
36 // init first row
37 line := make([]uint, len(b)+1)
38 for j := range line {
39 line[j] = uint(j)
40 }
41
42 // main loop
43 for i := 1; i <= len(a); i++ {
44 prev := line[0]
45 line[0] = uint(i)
46 for j := 1; j <= len(b); j++ {
47 old := line[j]
48 if a[i-1] == b[j-1] {
49 line[j] = prev
50 } else {
51 line[j] = min3(
52 line[j-1]+1, // insertion
53 line[j]+1, // deletion
54 prev+1, // substitution
55 )
56 }
57 prev = old
58 }
59 }
60 return line[len(b)]
61}
62
63// Levenshteinable is an interface for types that can be compared using
64// Levenshtein distance. It provides the LString method, which returns
65// the string representation used for comparison.
66//
67// Note: This differs from the standard String() method, which may be used
68// for display or debugging.
69type Levenshteinable interface {
70 LString() string
71}
72
73// PickFromString returns the Levenshteinable object from the slice xs
74// that is closest in Levenshtein distance to the input string xStr.
75//
76// It returns the best match along with its distance. If xs is empty,
77// it returns (nil, 0).
78//
79// This function runs in O(n) time, where n is the length of the slice.
80func PickFromString(xStr string, xs []Levenshteinable) (Levenshteinable, uint) {
81 // check for empty list
82 if len(xs) == 0 {
83 return nil, 0
84 }
85
86 // initialize
87 bestDistance := Distance(xStr, xs[0].LString())
88 best := xs[0]
89
90 // loop through objects
91 for n := 1; n < len(xs); n++ {
92 dist := Distance(xStr, xs[n].LString())
93 if dist < bestDistance {
94 bestDistance = dist
95 best = xs[n]
96 }
97 }
98 return best, bestDistance
99}
100
101// Pick returns the Levenshteinable object from the slice xs that is
102// closest in Levenshtein distance to the input object x.
103//
104// This is a convenience wrapper around PickFromString(x.LString(), xs).
105// It runs in O(n) time.
106func Pick(x Levenshteinable, xs []Levenshteinable) (Levenshteinable, uint) {
107 return PickFromString(x.LString(), xs)
108}
109
110func padLeftNum(n uint, width int) string {
111 s := strconv.FormatUint(uint64(n), 10)
112 for len(s) < width {
113 s = "0" + s
114 }
115 return s
116}
117
118// SeveralPickFromString returns the n closest Levenshteinable objects
119// to the input string xStr from the slice xs, using Levenshtein distance.
120//
121// If xs is empty, it returns nil. If n is 0, it returns an empty slice.
122//
123// The result is not guaranteed to be sorted by distance.
124// This function runs in O(n) time using an AVL tree for efficient selection.
125func SeveralPickFromString(xStr string, xs []Levenshteinable, n uint) []Levenshteinable {
126 // error casing
127 if len(xs) == 0 {
128 return nil
129 }
130 if n == 0 {
131 return []Levenshteinable{}
132 }
133
134 tree := avl.NewTree()
135 maxKey := ""
136
137 for k, v := range xs {
138 // calculate distance and create unique key
139 d := Distance(xStr, v.LString())
140 // ufmt.Sprintf("%016u:%d", d, k)
141 key := padLeftNum(d, 6) + ":" + strconv.Itoa(k) // no sprintf ( ;-;)
142 tree.Set(key, v)
143
144 // remove the element with the greater key
145 if uint(tree.Size()) > n {
146 tree.ReverseIterate("", "", func(k string, _ any) bool {
147 maxKey = k
148 return true
149 })
150 tree.Remove(maxKey)
151 }
152 }
153
154 // create the list
155 out := make([]Levenshteinable, 0, tree.Size())
156 tree.Iterate("", "", func(k string, v any) bool {
157 out = append(out, v.(Levenshteinable))
158 return false
159 })
160 return out
161}
162
163// SeveralPick returns the n closest Levenshteinable objects to the
164// input object x from the slice xs, using Levenshtein distance.
165//
166// This is a convenience wrapper around SeveralPickFromString(x.LString(), xs, n).
167// It runs in O(n) time.
168func SeveralPick(x Levenshteinable, xs []Levenshteinable, n uint) []Levenshteinable {
169 return SeveralPickFromString(x.LString(), xs, n)
170}
171
172func min3(a, b, c uint) uint {
173 min := a
174 if b < min {
175 min = b
176 }
177 if c < min {
178 min = c
179 }
180 return min
181}