@@ -1,4 +1,4 @@
-import Diff from './base';
+import Diff, { DiffOptions } from './base';
 import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs } from '../util/string';
 
 // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
@@ -48,94 +48,98 @@ const extendedWordChars = 'a-zA-Z0-9_\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2
 // tokens.
 const tokenizeIncludingWhitespace = new RegExp(`[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug');
 
-export const wordDiff = new Diff();
-wordDiff.equals = function(left, right, options) {
-  if (options.ignoreCase) {
-    left = left.toLowerCase();
-    right = right.toLowerCase();
-  }
-
-  return left.trim() === right.trim();
-};
 
-wordDiff.tokenize = function(value, options = {}) {
-  let parts;
-  if (options.intlSegmenter) {
-    if (options.intlSegmenter.resolvedOptions().granularity != 'word') {
-      throw new Error('The segmenter passed must have a granularity of "word"');
+class WordDiff extends Diff<string, string> {
+  protected equals(left: string, right: string, options: DiffOptions<string>) {
+    if (options.ignoreCase) {
+      left = left.toLowerCase();
+      right = right.toLowerCase();
     }
-    parts = Array.from(options.intlSegmenter.segment(value), segment => segment.segment);
-  } else {
-    parts = value.match(tokenizeIncludingWhitespace) || [];
+
+    return left.trim() === right.trim();
   }
-  const tokens = [];
-  let prevPart = null;
-  parts.forEach(part => {
-    if ((/\s/).test(part)) {
-      if (prevPart == null) {
-        tokens.push(part);
-      } else {
-        tokens.push(tokens.pop() + part);
-      }
-    } else if ((/\s/).test(prevPart)) {
-      if (tokens[tokens.length - 1] == prevPart) {
-        tokens.push(tokens.pop() + part);
-      } else {
-        tokens.push(prevPart + part);
+
+  protected tokenize(value: string, options: DiffOptions<string> = {}) {
+    let parts;
+    if (options.intlSegmenter) {
+      if (options.intlSegmenter.resolvedOptions().granularity != 'word') {
+        throw new Error('The segmenter passed must have a granularity of "word"');
       }
+      parts = Array.from(options.intlSegmenter.segment(value), segment => segment.segment);
     } else {
-      tokens.push(part);
+      parts = value.match(tokenizeIncludingWhitespace) || [];
     }
+    const tokens: string[] = [];
+    let prevPart = null;
+    parts.forEach(part => {
+      if ((/\s/).test(part)) {
+        if (prevPart == null) {
+          tokens.push(part);
+        } else {
+          tokens.push(tokens.pop() + part);
+        }
+      } else if (prevPart != null && (/\s/).test(prevPart)) {
+        if (tokens[tokens.length - 1] == prevPart) {
+          tokens.push(tokens.pop() + part);
+        } else {
+          tokens.push(prevPart + part);
+        }
+      } else {
+        tokens.push(part);
+      }
 
-    prevPart = part;
-  });
-  return tokens;
-};
-
-wordDiff.join = function(tokens) {
-  // Tokens being joined here will always have appeared consecutively in the
-  // same text, so we can simply strip off the leading whitespace from all the
-  // tokens except the first (and except any whitespace-only tokens - but such
-  // a token will always be the first and only token anyway) and then join them
-  // and the whitespace around words and punctuation will end up correct.
-  return tokens.map((token, i) => {
-    if (i == 0) {
-      return token;
-    } else {
-      return token.replace((/^\s+/), '');
-    }
-  }).join('');
-};
+      prevPart = part;
+    });
+    return tokens;
+  }
 
-wordDiff.postProcess = function(changes, options) {
-  if (!changes || options.oneChangePerToken) {
-    return changes;
+  protected join(tokens) {
+    // Tokens being joined here will always have appeared consecutively in the
+    // same text, so we can simply strip off the leading whitespace from all the
+    // tokens except the first (and except any whitespace-only tokens - but such
+    // a token will always be the first and only token anyway) and then join them
+    // and the whitespace around words and punctuation will end up correct.
+    return tokens.map((token, i) => {
+      if (i == 0) {
+        return token;
+      } else {
+        return token.replace((/^\s+/), '');
+      }
+    }).join('');
   }
 
-  let lastKeep = null;
-  // Change objects representing any insertion or deletion since the last
-  // "keep" change object. There can be at most one of each.
-  let insertion = null;
-  let deletion = null;
-  changes.forEach(change => {
-    if (change.added) {
-      insertion = change;
-    } else if (change.removed) {
-      deletion = change;
-    } else {
-      if (insertion || deletion) { // May be false at start of text
-        dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change);
+  protected postProcess(changes, options) {
+    if (!changes || options.oneChangePerToken) {
+      return changes;
+    }
+
+    let lastKeep = null;
+    // Change objects representing any insertion or deletion since the last
+    // "keep" change object. There can be at most one of each.
+    let insertion = null;
+    let deletion = null;
+    changes.forEach(change => {
+      if (change.added) {
+        insertion = change;
+      } else if (change.removed) {
+        deletion = change;
+      } else {
+        if (insertion || deletion) { // May be false at start of text
+          dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change);
+        }
+        lastKeep = change;
+        insertion = null;
+        deletion = null;
       }
-      lastKeep = change;
-      insertion = null;
-      deletion = null;
+    });
+    if (insertion || deletion) {
+      dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null);
     }
-  });
-  if (insertion || deletion) {
-    dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null);
+    return changes;
   }
-  return changes;
-};
+}
+
+export const wordDiff = new WordDiff();
 
 export function diffWords(oldStr, newStr, options) {
   // This option has never been documented and never will be (it's clearer to
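Reviewer note: the `tokenize`/`equals` pair above is what makes `diffWords` whitespace-insensitive, and it's easiest to see with a tiny example. This is a minimal sketch using the public `diff` package API; the expected outputs are my reading of the code in this hunk, not quoted from the test suite:

```ts
import { diffWords } from 'diff';

// tokenize() attaches whitespace to both neighbouring word tokens
// ('foo bar' -> ['foo ', ' bar']), and equals() compares tokens with
// surrounding whitespace trimmed, so a whitespace-only edit should
// yield no added/removed change objects:
const changes = diffWords('foo bar', 'foo  bar');
console.log(changes.some(c => c.added || c.removed)); // expected: false

// The intlSegmenter branch delegates word splitting to Intl.Segmenter
// (Node 16+ / modern browsers); per the check above, any granularity
// other than 'word' throws.
const segmenter = new Intl.Segmenter('en', { granularity: 'word' });
diffWords('foo bar', 'foo baz', { intlSegmenter: segmenter });
```

`join()` then strips the leading whitespace from every token after the first, so the whitespace duplicated across neighbouring tokens never leaks into `change.value`.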
@@ -273,16 +277,19 @@ function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep
 }
 
 
-export const wordWithSpaceDiff = new Diff();
-wordWithSpaceDiff.tokenize = function(value) {
-  // Slightly different to the tokenizeIncludingWhitespace regex used above in
-  // that this one treats each individual newline as a distinct token, rather
-  // than merging them into other surrounding whitespace. This was requested
-  // in https://github.com/kpdecker/jsdiff/issues/180 &
-  // https://github.com/kpdecker/jsdiff/issues/211
-  const regex = new RegExp(`(\\r?\\n)|[${extendedWordChars}]+|[^\\S\\n\\r]+|[^${extendedWordChars}]`, 'ug');
-  return value.match(regex) || [];
-};
+class WordsWithSpaceDiff extends Diff<string, string> {
+  protected tokenize(value: string) {
+    // Slightly different to the tokenizeIncludingWhitespace regex used above in
+    // that this one treats each individual newline as a distinct token, rather
+    // than merging them into other surrounding whitespace. This was requested
+    // in https://github.com/kpdecker/jsdiff/issues/180 &
+    // https://github.com/kpdecker/jsdiff/issues/211
+    const regex = new RegExp(`(\\r?\\n)|[${extendedWordChars}]+|[^\\S\\n\\r]+|[^${extendedWordChars}]`, 'ug');
+    return value.match(regex) || [];
+  }
+}
+
+export const wordsWithSpaceDiff = new WordsWithSpaceDiff();
 export function diffWordsWithSpace(oldStr, newStr, options) {
-  return wordWithSpaceDiff.diff(oldStr, newStr, options);
+  return wordsWithSpaceDiff.diff(oldStr, newStr, options);
 }
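And the renamed export in action: a quick sketch contrasting the two helpers, again assuming the package's usual change-object shape (`{ value, added, removed }`) with expected outputs inferred from this diff:

```ts
import { diffWords, diffWordsWithSpace } from 'diff';

// diffWords treats a space-to-newline edit as a whitespace-only change,
// so nothing should be reported as added or removed:
console.log(diffWords('one two', 'one\ntwo').some(c => c.added || c.removed));
// expected: false

// diffWordsWithSpace tokenizes each newline as its own token (the
// (\r?\n) alternative in the regex above), so the same edit should
// surface as a removed ' ' and an added '\n':
for (const change of diffWordsWithSpace('one two', 'one\ntwo')) {
  console.log(JSON.stringify(change.value), !!change.added, !!change.removed);
}
```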