fix bug in tokenizer
parent 36e434f6b9
commit a9c3f38d60
@@ -12,7 +12,7 @@ const special_tokens: any = {
 const special_tokens_map = new Map<string, number>();
 for (const text of Object.keys(special_tokens)) {
-  special_tokens_map.set(text, special_tokens_map[text]);
+  special_tokens_map.set(text, special_tokens[text]);
 }
 
 const pattern = /('s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu;
 
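
The fix replaces special_tokens_map[text] with special_tokens[text]: bracket indexing on a Map reads an ordinary object property, not a map entry, so the old line stored undefined for every token. A minimal standalone sketch of the fixed loop (the token names and ids here are hypothetical, not taken from the commit):

    // Hypothetical special-token table, for illustration only.
    const special_tokens: Record<string, number> = {
      "<|endoftext|>": 0,
      "<|padding|>": 1,
    };

    const special_tokens_map = new Map<string, number>();
    for (const text of Object.keys(special_tokens)) {
      // Fixed: read the id from the source object, then store it in the Map.
      special_tokens_map.set(text, special_tokens[text]);
    }

    console.log(special_tokens_map.get("<|endoftext|>")); // 0
    // The pre-fix line read special_tokens_map[text] instead; a Map has no
    // index signature, so that expression is always undefined at runtime.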
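
The unchanged pattern line resembles the Unicode-aware pre-tokenization regexes used by BPE tokenizers (contractions, letter runs, 1-3 digit groups, punctuation, and whitespace). A hedged usage sketch, splitting a sample string the way such a first pass would:

    const pattern = /('s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu;

    // Each match is one pre-token chunk, before any BPE merging.
    console.log("Hello world, it's fine!".match(pattern));
    // -> [ "Hello", " world", ",", " it", "'s", " fine", "!" ]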