Skip to content

Commit

Permalink
[es] Fix bug in the splitter
Browse files Browse the repository at this point in the history
Some verb forms were incorrectly split at the beginning of a
sentence if they contained uppercase accented vowels.

example: "Úsalo con precaución."
  • Loading branch information
sdocio committed Jul 10, 2020
1 parent 9d47350 commit b38f047
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
14 changes: 11 additions & 3 deletions t/es/01_splitter.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use warnings;
use strict;
use utf8;
use open ':std', ':encoding(utf8)';
use Test::More tests => 9;
use Test::More tests => 10;
use lib '.';

BEGIN {
Expand All @@ -43,7 +43,11 @@ my $tokens = [
'Cómpralo', 'para', 'mejorar', 'y', 'transformar', 'tus', 'ensaladas',
'favoritas', ',', 'y', 'siéntete', 'bien', 'al', 'dárselo', 'a',
'tu', 'familia', '.', ''
]
],
[
'Úsalo', 'con', 'precaución', ',', 'ya', 'que', 'su', 'fórmula',
'incluye', 'algunas', 'sustancias', 'químicas', '.', ''
],
];
my $expected_tokens = [
[
Expand All @@ -62,7 +66,11 @@ my $expected_tokens = [
'Compra', 'lo', 'para', 'mejorar', 'y', 'transformar', 'tus',
'ensaladas', 'favoritas', ',', 'y', 'siente', 'te', 'bien', 'a',
'el', 'dar', 'se', 'lo', 'a', 'tu', 'familia', '.', ''
]
],
[
'Usa', 'lo', 'con', 'precaución', ',', 'ya_que', 'su', 'fórmula',
'incluye', 'algunas', 'sustancias', 'químicas', '.', ''
],
];

my $splitted_with_locs = [
Expand Down
4 changes: 2 additions & 2 deletions tagger/es/splitter-es_exe.perl
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,15 @@ sub splitter {
}
}
##imperativo 2 pessoa singular: cómelo (falta tratar monósilabos: vete, dale...)
if (!$found && $token =~ /^(\w+)($pron)$/i && $token =~ /[áéíóú]/ && $token !~ /mosnos$/ && $token ne "séase") { ##nom separar séase (de "o seáse")
if (!$found && $token =~ /^(\w+)($pron)$/i && $token =~ /[áéíóú]/i && $token !~ /mosnos$/ && $token ne "séase") { ##nom separar séase (de "o seáse")

if ($token =~ /nos$/i) {
($verb,$tmp1) = $token =~ /^(\w+)(nos)$/i;
}
else {
($verb,$tmp1) = $token =~ /^(\w+)($pron)$/i;
}
$verb =~ y/áéíóú/aeiou/;
$verb =~ y/áéíóúÁÉÍÓÚ/aeiouAEIOU/;
#print STDERR "----#$verb# #$tmp1#\n";
if ($Imp{lowercase($verb)}) {

Expand Down

0 comments on commit b38f047

Please sign in to comment.