
Commit

Merge branch 'master' into merge-1.9.4-master
dakshvar22 authored Mar 30, 2020
2 parents bb61718 + 328b49b commit a76dacc
Showing 10 changed files with 526 additions and 21 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -14,6 +14,8 @@
Rasa is an open source machine learning framework to automate text- and voice-based conversations. With Rasa, you can build contextual assistants on:
- Facebook Messenger
- Slack
- Google Hangouts
- Webex Teams
- Microsoft Bot Framework
- Rocket.Chat
- Mattermost
5 changes: 5 additions & 0 deletions changelog/5475.bugfix.rst
@@ -0,0 +1,5 @@
One word can only have one entity label.

If you are using, for example, the ``ConveRTTokenizer``, words can be split into multiple tokens.
Our entity extractors assign entity labels per token, so a word that was split into two tokens could
end up with two different entity labels. This is now fixed: one word can only have one entity label at a time.
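
A minimal sketch of the new behaviour (the word, entity names, offsets, and confidence values below are invented for illustration):

```python
# Hypothetical example: "amsterdam" was split into the sub-word tokens
# "amster" and "dam", and each token received its own entity label.
entities_before = [
    {"entity": "city", "value": "amster", "start": 7, "end": 13, "confidence": 0.87},
    {"entity": "person", "value": "dam", "start": 13, "end": 16, "confidence": 0.34},
]

# After the fix, only the label with the highest confidence survives,
# and it applies to the whole word.
entities_after = [
    {"entity": "city", "value": "amsterdam", "start": 7, "end": 16, "confidence": 0.87},
]
```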
5 changes: 5 additions & 0 deletions changelog/5509.bugfix.rst
@@ -0,0 +1,5 @@
An entity label should always cover a complete word.

If you are using, for example, the ``ConveRTTokenizer``, words can be split into multiple tokens.
Our entity extractors assign entity labels per token, so it could happen that only part of a word
carried an entity label. This is now fixed: an entity label now always covers the complete word.
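
Again a minimal sketch, with invented tokens and offsets, of how an entity that only covered a sub-word is expanded to the full word:

```python
# Hypothetical example: "playground" was split into "play" and "ground",
# and only the first sub-word was tagged by the extractor.
entities_before = [
    {"entity": "location", "value": "play", "start": 12, "end": 16, "confidence": 0.91},
]

# After the fix, the entity boundaries are expanded to cover the complete word.
entities_after = [
    {"entity": "location", "value": "playground", "start": 12, "end": 22, "confidence": 0.91},
]
```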
15 changes: 9 additions & 6 deletions rasa/nlu/classifiers/diet_classifier.py
@@ -329,11 +329,11 @@ def _tag_id_index_mapping(self, training_data: TrainingData) -> Dict[Text, int]:
        if self.component_config[BILOU_FLAG]:
            return bilou_utils.build_tag_id_dict(training_data)

-        distinct_tag_ids = set(
+        distinct_tag_ids = {
            e["entity"]
            for example in training_data.entity_examples
            for e in example.get(ENTITIES)
-        ) - {None}
+        } - {None}

        tag_id_dict = {
            tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1)
@@ -662,7 +662,7 @@ def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
                "There is no trained model: component is either not trained or "
                "didn't receive enough training data."
            )
-            return
+            return None

        # create session data from message and convert it into a batch of 1
        model_data = self._create_model_data([message])
@@ -739,8 +739,9 @@ def _predict_entities(
            message.text, message.get(TOKENS_NAMES[TEXT], []), tags
        )

-        extracted = self.add_extractor_name(entities)
-        entities = message.get(ENTITIES, []) + extracted
+        entities = self.add_extractor_name(entities)
+        entities = self.clean_up_entities(message, entities)
+        entities = message.get(ENTITIES, []) + entities

        return entities

@@ -1191,7 +1192,7 @@ def _combine_sparse_dense_features(

    def _features_as_seq_ids(
        self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
-    ) -> tf.Tensor:
+    ) -> Optional[tf.Tensor]:
        """Creates dense labels for negative sampling."""

        # if there are dense features - we can use them
@@ -1206,6 +1207,8 @@
                    self._tf_layers[f"sparse_to_dense_ids.{name}"](f)
                )

+        return None

    def _create_bow(
        self,
        features: List[Union[tf.Tensor, tf.SparseTensor]],
Expand Down
9 changes: 5 additions & 4 deletions rasa/nlu/extractors/crf_entity_extractor.py
@@ -156,17 +156,18 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]:
        return dataset

    def process(self, message: Message, **kwargs: Any) -> None:
-        extracted = self.add_extractor_name(self.extract_entities(message))
-        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
+        entities = self.add_extractor_name(self.extract_entities(message))
+        entities = self.clean_up_entities(message, entities)
+        message.set(ENTITIES, message.get(ENTITIES, []) + entities, add_to_output=True)

    def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
        """Take a sentence and return entities in json format"""

        if self.ent_tagger is not None:
            text_data = self._from_text_to_crf(message)
            features = self._sentence_to_features(text_data)
-            ents = self.ent_tagger.predict_marginals_single(features)
-            return self._from_crf_to_json(message, ents)
+            entities = self.ent_tagger.predict_marginals_single(features)
+            return self._from_crf_to_json(message, entities)
        else:
            return []
5 changes: 2 additions & 3 deletions rasa/nlu/extractors/duckling_http_extractor.py
@@ -186,9 +186,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
            )

        extracted = self.add_extractor_name(extracted)
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        extracted = self.clean_up_entities(message, extracted)
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)

    @classmethod
    def load(
249 changes: 247 additions & 2 deletions rasa/nlu/extractors/extractor.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, List, Text, Tuple
+from typing import Any, Dict, List, Text, Tuple, Optional, Union

+from rasa.nlu.tokenizers.tokenizer import Token
from rasa.nlu.components import Component
-from rasa.nlu.constants import EXTRACTOR, ENTITIES
+from rasa.nlu.constants import EXTRACTOR, ENTITIES, TOKENS_NAMES, TEXT
from rasa.nlu.training_data import Message


@@ -21,6 +22,250 @@ def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]:

        return entity

    def clean_up_entities(
        self, message: Message, entities: List[Dict[Text, Any]], keep: bool = True
    ) -> List[Dict[Text, Any]]:
        """Check whether multiple entity labels are assigned to one word, whether an
        entity label is assigned to just part of a word, or whether an entity label
        covers multiple words but one of those words only partly.

        This might happen if you are using a tokenizer that splits words into
        sub-words and different entity labels are assigned to the individual sub-words.
        If multiple entity labels are assigned to one word, we keep the entity label
        with the highest confidence as the entity label for that word. If just a part
        of the word is annotated, that entity label is taken for the complete word.
        If you set 'keep' to 'False', all entity labels for the word are removed.

        Args:
            message: message object
            entities: list of entities
            keep:
                If set to 'True', the entity label with the highest confidence is kept
                if multiple entity labels are assigned to one word. If set to 'False',
                all entity labels for that word are removed.

        Returns:
            Updated entities.
        """
        misaligned_entities = self._get_misaligned_entities(
            message.get(TOKENS_NAMES[TEXT]), entities
        )

        entity_indices_to_remove = set()

        for misaligned_entity in misaligned_entities:
            # entity indices involved in the misalignment
            entity_indices = misaligned_entity["entity_indices"]

            if not keep:
                entity_indices_to_remove.update(entity_indices)
                continue

            idx = self._entity_index_to_keep(entities, entity_indices)

            if idx is None:
                entity_indices_to_remove.update(entity_indices)
            else:
                # keep just one entity
                entity_indices.remove(idx)
                entity_indices_to_remove.update(entity_indices)

                # update that entity to cover the complete word(s)
                entities[idx]["start"] = misaligned_entity["start"]
                entities[idx]["end"] = misaligned_entity["end"]
                entities[idx]["value"] = message.text[
                    misaligned_entity["start"] : misaligned_entity["end"]
                ]

        # sort indices to remove entries at the end of the list first
        # to avoid index out of range errors
        for idx in sorted(entity_indices_to_remove, reverse=True):
            entities.remove(entities[idx])

        return entities

    def _get_misaligned_entities(
        self, tokens: List[Token], entities: List[Dict[Text, Any]]
    ) -> List[Dict[Text, Any]]:
        """Identify entities and tokens that are misaligned.

        Misaligned entities are those that apply only to a part of a word, i.e.
        sub-word.

        Args:
            tokens: list of tokens
            entities: list of detected entities by the entity extractor

        Returns:
            Misaligned entities including the start and end position
            of the final entity in the text and entity indices that are part of this
            misalignment.
        """
        if not tokens:
            return []

        # group tokens: one token cluster corresponds to one word
        token_clusters = self._token_clusters(tokens)

        # added for tests, should only happen if tokens are not set or len(tokens) == 1
        if not token_clusters:
            return []

        misaligned_entities = []
        for entity_idx, entity in enumerate(entities):
            # get all tokens that are covered/touched by the entity
            entity_tokens = self._tokens_of_entity(entity, token_clusters)

            if len(entity_tokens) == 1:
                # entity covers exactly one word
                continue

            # get start and end position of complete word
            # needed to update the final entity later
            start_position = entity_tokens[0].start
            end_position = entity_tokens[-1].end

            # check if an entity was already found that covers the exact same word(s)
            _idx = self._misaligned_entity_index(
                misaligned_entities, start_position, end_position
            )

            if _idx is None:
                misaligned_entities.append(
                    {
                        "start": start_position,
                        "end": end_position,
                        "entity_indices": [entity_idx],
                    }
                )
            else:
                misaligned_entities[_idx]["entity_indices"].append(entity_idx)

        return misaligned_entities

    @staticmethod
    def _misaligned_entity_index(
        word_entity_cluster: List[Dict[Text, Union[int, List[int]]]],
        start_position: int,
        end_position: int,
    ) -> Optional[int]:
        """Get index of matching misaligned entity.

        Args:
            word_entity_cluster: word entity cluster
            start_position: start position
            end_position: end position

        Returns:
            Index of the misaligned entity that matches the provided start and end
            position.
        """
        for idx, cluster in enumerate(word_entity_cluster):
            if cluster["start"] == start_position and cluster["end"] == end_position:
                return idx
        return None

    @staticmethod
    def _tokens_of_entity(
        entity: Dict[Text, Any], token_clusters: List[List[Token]]
    ) -> List[Token]:
        """Get all tokens of token clusters that are covered by the entity.

        The entity can cover them completely or just partly.

        Args:
            entity: the entity
            token_clusters: list of token clusters

        Returns:
            Token clusters that belong to the provided entity.
        """
        entity_tokens = []
        for token_cluster in token_clusters:
            entity_starts_inside_cluster = (
                token_cluster[0].start <= entity["start"] <= token_cluster[-1].end
            )
            entity_ends_inside_cluster = (
                token_cluster[0].start <= entity["end"] <= token_cluster[-1].end
            )

            if entity_starts_inside_cluster or entity_ends_inside_cluster:
                entity_tokens += token_cluster
        return entity_tokens

    @staticmethod
    def _token_clusters(tokens: List[Token]) -> List[List[Token]]:
        """Build clusters of tokens that belong to one word.

        Args:
            tokens: list of tokens

        Returns:
            Token clusters.
        """
        # token cluster = list of token indices that belong to one word
        token_index_clusters = []

        # start at 1 in order to check if current token and previous token belong
        # to the same word
        for token_idx in range(1, len(tokens)):
            previous_token_idx = token_idx - 1
            # two tokens belong to the same word if there is no other character
            # between them
            if tokens[token_idx].start == tokens[previous_token_idx].end:
                # a word was split into multiple tokens
                token_cluster_already_exists = (
                    token_index_clusters
                    and token_index_clusters[-1][-1] == previous_token_idx
                )
                if token_cluster_already_exists:
                    token_index_clusters[-1].append(token_idx)
                else:
                    token_index_clusters.append([previous_token_idx, token_idx])
            else:
                # the token corresponds to a single word
                if token_idx == 1:
                    token_index_clusters.append([previous_token_idx])
                token_index_clusters.append([token_idx])

        return [[tokens[idx] for idx in cluster] for cluster in token_index_clusters]

    @staticmethod
    def _entity_index_to_keep(
        entities: List[Dict[Text, Any]], entity_indices: List[int]
    ) -> Optional[int]:
        """Determine the entity index to keep.

        If there is just one entity index, i.e. one candidate, the index of that
        candidate is returned. If there are multiple candidates, the index of the
        entity with the highest confidence score is returned. If a confidence score
        is missing for any candidate, no entity label is kept.

        Args:
            entities: the full list of entities
            entity_indices: the entity indices to consider

        Returns:
            The index of the entity to keep.
        """
        if len(entity_indices) == 1:
            return entity_indices[0]

        confidences = [
            entities[idx]["confidence"]
            for idx in entity_indices
            if "confidence" in entities[idx]
        ]

        # we don't have confidence values for all entity labels
        if len(confidences) != len(entity_indices):
            return None

        # map the position of the highest confidence back to an index into `entities`
        return entity_indices[confidences.index(max(confidences))]

    @staticmethod
    def filter_irrelevant_entities(extracted: list, requested_dimensions: set) -> list:
        """Only return dimensions the user configured"""
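
To make the clean-up rules above concrete, here is a small standalone sketch (plain Python, no Rasa imports; the tokens, labels, and confidences are invented) that mirrors the word clustering and the confidence-based choice of which entity label to keep:

```python
from typing import Any, Dict, List, Optional, Tuple

# A token is modelled here as a (text, start, end) triple instead of Rasa's Token class.
SimpleToken = Tuple[str, int, int]


def token_clusters(tokens: List[SimpleToken]) -> List[List[SimpleToken]]:
    """Group sub-word tokens into words: two tokens belong to the same word
    if there is no character between them (previous end == next start)."""
    clusters: List[List[SimpleToken]] = []
    for token in tokens:
        if clusters and clusters[-1][-1][2] == token[1]:
            clusters[-1].append(token)
        else:
            clusters.append([token])
    return clusters


def entity_index_to_keep(
    entities: List[Dict[str, Any]], entity_indices: List[int]
) -> Optional[int]:
    """With one candidate, keep it; with several, keep the one with the highest
    confidence; if any candidate has no confidence value, keep none."""
    if len(entity_indices) == 1:
        return entity_indices[0]
    confidences = [
        entities[idx]["confidence"]
        for idx in entity_indices
        if "confidence" in entities[idx]
    ]
    if len(confidences) != len(entity_indices):
        return None
    return entity_indices[confidences.index(max(confidences))]


# "amsterdam" was tokenized into the sub-words "amster" and "dam" ...
tokens = [("i", 0, 1), ("love", 2, 6), ("amster", 7, 13), ("dam", 13, 16)]
print(token_clusters(tokens))  # the last two tokens end up in one word cluster

# ... and each sub-word received its own (competing) entity label.
entities = [
    {"entity": "city", "value": "amster", "confidence": 0.87},
    {"entity": "person", "value": "dam", "confidence": 0.34},
]
print(entity_index_to_keep(entities, [0, 1]))  # -> 0, the "city" label wins
```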
5 changes: 2 additions & 3 deletions rasa/nlu/extractors/mitie_entity_extractor.py
@@ -142,9 +142,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
            message.text, self._tokens_without_cls(message), mitie_feature_extractor
        )
        extracted = self.add_extractor_name(ents)
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        extracted = self.clean_up_entities(message, extracted)
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)

    @classmethod
    def load(
5 changes: 2 additions & 3 deletions rasa/nlu/extractors/spacy_entity_extractor.py
@@ -32,13 +32,12 @@ def process(self, message: Message, **kwargs: Any) -> None:
        spacy_nlp = kwargs.get("spacy_nlp", None)
        doc = spacy_nlp(message.text)
        all_extracted = self.add_extractor_name(self.extract_entities(doc))
+        all_extracted = self.clean_up_entities(message, all_extracted)
        dimensions = self.component_config["dimensions"]
        extracted = SpacyEntityExtractor.filter_irrelevant_entities(
            all_extracted, dimensions
        )
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)

    @staticmethod
    def extract_entities(doc: "Doc") -> List[Dict[Text, Any]]: