@@ -6338,7 +6338,7 @@ Parser.hyphenatedAcronym = s => {
63386338
63396339// parse a hyphenated numerical term (3-times)
63406340
6341- Parser . hyphenatedNum = s => {
6341+ Parser . numericalTerm = s => {
63426342 if ( / [ ^ \p{ N} \p{ L} \- ] / v. test ( s ) ) {
63436343 return Parser . Invalid ( {
63446344 value : s ,
@@ -10120,6 +10120,50 @@ S.splitAscii = s => {
1012010120} ;
1012110121
1012210122
10123+ /* Tokenize a string.
10124+
10125+ excl: exclude additional character classes or single characters from replacement
10126+ rules: array of regex/replacement pairs for additional replacements
10127+
10128+ Tokenize the following kinds of tokens:
10129+ • word (foo)
10130+ • hyphenated word (foo-bar)
10131+ • numerical term (3-foo)
10132+ • abbreviation (abbr.)
10133+ • acronym (UNHCR)
10134+ • hyphenated acronym (IT-solution)
10135+ • proper name (O'Bar) */
10136+
10137+ S . tokenize = ( { excl, rules} ) => s => {
10138+ return This ( s )
10139+ . map ( s2 => s2 . replace ( R . gv ( `[^\\p{L}\\d\\-.'&\\/${ R . escape ( excl ) } ]` ) , " " ) )
10140+
10141+ // remove apostrophes used for quoting
10142+
10143+ . map ( s2 => s2 . replaceAll ( / (?< = ^ | ) ' + (? = \p{ L} ) / gmv, " " ) )
10144+ . map ( s2 => s2 . replaceAll ( / (?< = \p{ L} ) ' + (? = | $ ) / gmv, " " ) )
10145+
10146+ // execute additional rules
10147+
10148+ . map ( s2 => {
10149+ for ( const [ rx , repl ] of rules ) s2 = s2 . replaceAll ( rule , repl ) ;
10150+ return s2 ;
10151+ } )
10152+
10153+ // normalize and replace new lines
10154+
10155+ . map ( s2 => s2 . replaceAll ( / \r / g, "" ) )
10156+ . map ( s2 => s2 . replaceAll ( / \n + / g, " " ) )
10157+
10158+ // delete redundant spaces
10159+
10160+ . map ( s2 => s2 . replaceAll ( / { 2 , } / g, " " ) )
10161+ . map ( s2 => s2 . trim ( ) )
10162+ . unref
10163+ . split ( " " ) ;
10164+ } ;
10165+
10166+
1012310167S . splitName = ( ...titles ) => s => {
1012410168 const titles2 = [ ] ;
1012510169
0 commit comments