| package levenshtein |
| |
| import ( |
| "fmt" |
| "os" |
| ) |
| |
| type EditOperation int |
| const ( |
| ins = iota |
| del |
| sub |
| match |
| ) |
| |
| type EditScript []EditOperation |
| |
| type MatchFunction func(rune, rune) bool |
| |
| type Options struct { |
| InsCost int |
| DelCost int |
| SubCost int |
| Matches MatchFunction |
| } |
| |
| // DefaultOptions is the default options: insertion cost is 1, deletion cost is |
| // 1, substitution cost is 2, and two runes match iff they are the same. |
| var DefaultOptions Options = Options { |
| InsCost: 1, |
| DelCost: 1, |
| SubCost: 2, |
| Matches: func (sourceCharacter rune, targetCharacter rune) bool { |
| return sourceCharacter == targetCharacter |
| }, |
| } |
| |
| func (operation EditOperation) String() string { |
| if operation == match { |
| return "match" |
| } else if operation == ins { |
| return "ins" |
| } else if operation == sub { |
| return "sub" |
| } |
| return "del" |
| } |
| |
| // DistanceForStrings returns the edit distance between source and target. |
| func DistanceForStrings(source []rune, target []rune, op Options) int { |
| matrix := MatrixForStrings(source, target, op) |
| return matrix[len(target)][len(source)] |
| } |
| |
| // DistanceForMatrix reads the edit distance off the given Levenshtein matrix. |
| func DistanceForMatrix(matrix [][]int) int { |
| return matrix[len(matrix[0]) - 1][len(matrix) - 1] |
| } |
| |
| // MatrixForStrings generates a 2-D array representing the dynamic programming |
| // table used by the Levenshtein algorithm, as described e.g. here: |
| // http://www.let.rug.nl/kleiweg/lev/ |
| // The reason for putting the creation of the table into a separate function is |
| // that is cannot only be used for reading of the edit distance between two |
| // strings, but also e.g. to backtrace an edit script that provides an |
| // alignment between the characters of both strings. |
| func MatrixForStrings(source []rune, target []rune, op Options) [][]int { |
| // Make a 2-D matrix. Rows correspond to prefixes of source, columns to |
| // prefixes of target. Cells will contain edit distances. |
| // Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html |
| height := len(source) + 1 |
| width := len(target) + 1 |
| matrix := make([][]int, height) |
| |
| // Initialize trivial distances (from/to empty string). That is, fill |
| // the left column and the top row with row/column indices. |
| for i := 0; i < height; i++ { |
| matrix[i] = make([]int, width) |
| matrix[i][0] = i |
| } |
| for j := 1; j < width; j++ { |
| matrix[0][j] = j |
| } |
| |
| // Fill in the remaining cells: for each prefix pair, choose the |
| // (edit history, operation) pair with the lowest cost. |
| for i := 1; i < height; i++ { |
| for j := 1; j < width; j++ { |
| delCost := matrix[i - 1][j] + op.DelCost |
| matchSubCost := matrix[i - 1][j - 1] |
| if !op.Matches(source[i - 1], target[j - 1]) { |
| matchSubCost += op.SubCost |
| } |
| insCost := matrix[i][j - 1] + op.InsCost |
| matrix[i][j] = min(delCost, min(matchSubCost, |
| insCost)) |
| } |
| } |
| //LogMatrix(source, target, matrix) |
| return matrix |
| } |
| |
| // EditScriptForStrings returns an optimal edit script to turn source into |
| // target. |
| func EditScriptForStrings(source []rune, target []rune, op Options) EditScript { |
| return backtrace(len(source), len(target), |
| MatrixForStrings(source, target, op), op) |
| } |
| |
| // EditScriptForMatrix returns an optimal edit script based on the given |
| // Levenshtein matrix. |
| func EditScriptForMatrix(matrix [][]int, op Options) EditScript { |
| return backtrace(len(matrix[0]) - 1, len(matrix) - 1, matrix, op) |
| } |
| |
| // LogMatrix outputs a visual representation of the given matrix for the given |
| // strings on os.Stderr. |
| func LogMatrix(source []rune, target []rune, matrix [][]int) { |
| fmt.Fprintf(os.Stderr, " ") |
| for _, targetRune := range target { |
| fmt.Fprintf(os.Stderr, " %c", targetRune) |
| } |
| fmt.Fprintf(os.Stderr, "\n") |
| fmt.Fprintf(os.Stderr, " %2d", matrix[0][0]) |
| for j, _ := range target { |
| fmt.Fprintf(os.Stderr, " %2d", matrix[0][j + 1]) |
| } |
| fmt.Fprintf(os.Stderr, "\n") |
| for i, sourceRune := range source { |
| fmt.Fprintf(os.Stderr, "%c %2d", sourceRune, matrix[i + 1][0]) |
| for j, _ := range target { |
| fmt.Fprintf(os.Stderr, " %2d", matrix[i + 1][j + 1]) |
| } |
| fmt.Fprintf(os.Stderr, "\n") |
| } |
| } |
| |
| func backtrace(i int, j int, matrix [][]int, op Options) EditScript { |
| if i > 0 && matrix[i - 1][j] + op.DelCost == matrix[i][j] { |
| return append(backtrace(i - 1, j, matrix, op), del) |
| } |
| if j > 0 && matrix[i][j - 1] + op.InsCost == matrix[i][j] { |
| return append(backtrace(i, j - 1, matrix, op), ins) |
| } |
| if i > 0 && j > 0 && matrix[i - 1][j - 1] + op.SubCost == matrix[i][j] { |
| return append(backtrace(i - 1, j - 1, matrix, op), sub) |
| } |
| if i > 0 && j > 0 && matrix[i - 1][j - 1] == matrix[i][j] { |
| return append(backtrace(i - 1, j - 1, matrix, op), match) |
| } |
| return []EditOperation{} |
| } |
| |
| func min(a int, b int) int { |
| if b < a { |
| return b |
| } |
| return a |
| } |
| |
| func max(a int, b int) int { |
| if b > a { |
| return b |
| } |
| return a |
| } |