Skip to content

Commit bf79bf9

Browse files
add string tokenization
1 parent e947de8 commit bf79bf9

File tree

1 file changed

+45
-1
lines changed

1 file changed

+45
-1
lines changed

scriptum.js

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6338,7 +6338,7 @@ Parser.hyphenatedAcronym = s => {
63386338

63396339
// parse a hyphenated numerical term (3-times)
63406340

6341-
Parser.hyphenatedNum = s => {
6341+
Parser.numericalTerm = s => {
63426342
if (/[^\p{N}\p{L}\-]/v.test(s)) {
63436343
return Parser.Invalid({
63446344
value: s,
@@ -10120,6 +10120,50 @@ S.splitAscii = s => {
1012010120
};
1012110121

1012210122

10123+
/* Tokenize a string.
10124+
10125+
excl: exclude additional character classes or single characters from replacement
10126+
rules: array of regex/replacement pairs for additional replacements
10127+
10128+
Tokenize the following kinds of tokens:
10129+
• word (foo)
10130+
• hyphenated word (foo-bar)
10131+
• numerical term (3-foo)
10132+
• abbreviation (abbr.)
10133+
• acronym (UNHCR)
10134+
• hyphenated acronym (IT-solution)
10135+
• proper name (O'Bar) */
10136+
10137+
S.tokenize = ({excl, rules}) => s => {
10138+
return This(s)
10139+
.map(s2 => s2.replace(R.gv(`[^\\p{L}\\d\\-.'&\\/${R.escape(excl)}]`), " "))
10140+
10141+
// remove apostrophes used for quoting
10142+
10143+
.map(s2 => s2.replaceAll(/(?<=^| )'+(?=\p{L})/gmv, " "))
10144+
.map(s2 => s2.replaceAll(/(?<=\p{L})'+(?= |$)/gmv, " "))
10145+
10146+
// execute additional rules
10147+
10148+
.map(s2 => {
10149+
for (const [rx, repl] of rules) s2 = s2.replaceAll(rule, repl);
10150+
return s2;
10151+
})
10152+
10153+
// normalize and replace new lines
10154+
10155+
.map(s2 => s2.replaceAll(/\r/g, ""))
10156+
.map(s2 => s2.replaceAll(/\n+/g, " "))
10157+
10158+
// delete redundant spaces
10159+
10160+
.map(s2 => s2.replaceAll(/ {2,}/g, " "))
10161+
.map(s2 => s2.trim())
10162+
.unref
10163+
.split(" ");
10164+
};
10165+
10166+
1012310167
S.splitName = (...titles) => s => {
1012410168
const titles2 = [];
1012510169

0 commit comments

Comments
 (0)