
Commit

Merge branch 'master' into merge-1.9.4-master
dakshvar22 authored Mar 30, 2020
2 parents bb61718 + 328b49b commit a76dacc
Showing 10 changed files with 526 additions and 21 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -14,6 +14,8 @@
Rasa is an open source machine learning framework to automate text- and voice-based conversations. With Rasa, you can build contextual assistants on:
- Facebook Messenger
- Slack
- Google Hangouts
- Webex Teams
- Microsoft Bot Framework
- Rocket.Chat
- Mattermost
5 changes: 5 additions & 0 deletions changelog/5475.bugfix.rst
@@ -0,0 +1,5 @@
One word can only have one entity label.

If you are using, for example, the ``ConveRTTokenizer``, words can be split into multiple tokens.
Our entity extractors assign entity labels per token, so a word that was split into two tokens could
end up with two different entity labels. This is now fixed: one word can only have one entity label at a time.
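
A minimal sketch of the new behaviour (the word, entity names, offsets, and confidence values below are invented for illustration):

```python
# Hypothetical example: "amsterdam" was split into the sub-word tokens
# "amster" and "dam", and each token received its own entity label.
entities_before = [
    {"entity": "city", "value": "amster", "start": 7, "end": 13, "confidence": 0.87},
    {"entity": "person", "value": "dam", "start": 13, "end": 16, "confidence": 0.34},
]

# After the fix, only the label with the highest confidence survives,
# and it applies to the whole word.
entities_after = [
    {"entity": "city", "value": "amsterdam", "start": 7, "end": 16, "confidence": 0.87},
]
```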
5 changes: 5 additions & 0 deletions changelog/5509.bugfix.rst
@@ -0,0 +1,5 @@
An entity label should always cover a complete word.

If you are using, for example, the ``ConveRTTokenizer``, words can be split into multiple tokens.
Our entity extractors assign entity labels per token, so it could happen that only part of a word
carried an entity label. This is now fixed: an entity label now always covers the complete word.
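
Again a minimal sketch, with invented tokens and offsets, of how an entity that only covered a sub-word is expanded to the full word:

```python
# Hypothetical example: "playground" was split into "play" and "ground",
# and only the first sub-word was tagged by the extractor.
entities_before = [
    {"entity": "location", "value": "play", "start": 12, "end": 16, "confidence": 0.91},
]

# After the fix, the entity boundaries are expanded to cover the complete word.
entities_after = [
    {"entity": "location", "value": "playground", "start": 12, "end": 22, "confidence": 0.91},
]
```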
15 changes: 9 additions & 6 deletions rasa/nlu/classifiers/diet_classifier.py
@@ -329,11 +329,11 @@ def _tag_id_index_mapping(self, training_data: TrainingData) -> Dict[Text, int]:
        if self.component_config[BILOU_FLAG]:
            return bilou_utils.build_tag_id_dict(training_data)

-        distinct_tag_ids = set(
+        distinct_tag_ids = {
            e["entity"]
            for example in training_data.entity_examples
            for e in example.get(ENTITIES)
-        ) - {None}
+        } - {None}

        tag_id_dict = {
            tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1)
@@ -662,7 +662,7 @@ def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]:
                "There is no trained model: component is either not trained or "
                "didn't receive enough training data."
            )
-            return
+            return None

        # create session data from message and convert it into a batch of 1
        model_data = self._create_model_data([message])
@@ -739,8 +739,9 @@ def _predict_entities(
            message.text, message.get(TOKENS_NAMES[TEXT], []), tags
        )

-        extracted = self.add_extractor_name(entities)
-        entities = message.get(ENTITIES, []) + extracted
+        entities = self.add_extractor_name(entities)
+        entities = self.clean_up_entities(message, entities)
+        entities = message.get(ENTITIES, []) + entities

        return entities

@@ -1191,7 +1192,7 @@ def _combine_sparse_dense_features(

    def _features_as_seq_ids(
        self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text
-    ) -> tf.Tensor:
+    ) -> Optional[tf.Tensor]:
        """Creates dense labels for negative sampling."""

        # if there are dense features - we can use them
@@ -1206,6 +1207,8 @@
                    self._tf_layers[f"sparse_to_dense_ids.{name}"](f)
                )

+        return None

    def _create_bow(
        self,
        features: List[Union[tf.Tensor, tf.SparseTensor]],
Expand Down
9 changes: 5 additions & 4 deletions rasa/nlu/extractors/crf_entity_extractor.py
@@ -156,17 +156,18 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]:
        return dataset

    def process(self, message: Message, **kwargs: Any) -> None:
-        extracted = self.add_extractor_name(self.extract_entities(message))
-        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
+        entities = self.add_extractor_name(self.extract_entities(message))
+        entities = self.clean_up_entities(message, entities)
+        message.set(ENTITIES, message.get(ENTITIES, []) + entities, add_to_output=True)

    def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
        """Take a sentence and return entities in json format"""

        if self.ent_tagger is not None:
            text_data = self._from_text_to_crf(message)
            features = self._sentence_to_features(text_data)
-            ents = self.ent_tagger.predict_marginals_single(features)
-            return self._from_crf_to_json(message, ents)
+            entities = self.ent_tagger.predict_marginals_single(features)
+            return self._from_crf_to_json(message, entities)
        else:
            return []
5 changes: 2 additions & 3 deletions rasa/nlu/extractors/duckling_http_extractor.py
@@ -186,9 +186,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
            )

        extracted = self.add_extractor_name(extracted)
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        extracted = self.clean_up_entities(message, extracted)
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)

    @classmethod
    def load(
249 changes: 247 additions & 2 deletions rasa/nlu/extractors/extractor.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, List, Text, Tuple
+from typing import Any, Dict, List, Text, Tuple, Optional, Union

+from rasa.nlu.tokenizers.tokenizer import Token
from rasa.nlu.components import Component
-from rasa.nlu.constants import EXTRACTOR, ENTITIES
+from rasa.nlu.constants import EXTRACTOR, ENTITIES, TOKENS_NAMES, TEXT
from rasa.nlu.training_data import Message


@@ -21,6 +22,250 @@ def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]:

        return entity

    def clean_up_entities(
        self, message: Message, entities: List[Dict[Text, Any]], keep: bool = True
    ) -> List[Dict[Text, Any]]:
        """Check whether multiple entity labels are assigned to one word, whether an
        entity label is assigned to just part of a word, or whether an entity label
        covers multiple words but one of those words only partly.

        This might happen if you are using a tokenizer that splits words into
        sub-words and different entity labels are assigned to the individual sub-words.
        If multiple entity labels are assigned to one word, we keep the entity label
        with the highest confidence as the entity label for that word. If just a part
        of the word is annotated, that entity label is taken for the complete word.
        If you set 'keep' to 'False', all entity labels for the word are removed.

        Args:
            message: message object
            entities: list of entities
            keep:
                If set to 'True', the entity label with the highest confidence is kept
                if multiple entity labels are assigned to one word. If set to 'False',
                all entity labels for that word are removed.

        Returns:
            Updated entities.
        """
        misaligned_entities = self._get_misaligned_entities(
            message.get(TOKENS_NAMES[TEXT]), entities
        )

        entity_indices_to_remove = set()

        for misaligned_entity in misaligned_entities:
            # entity indices involved in the misalignment
            entity_indices = misaligned_entity["entity_indices"]

            if not keep:
                entity_indices_to_remove.update(entity_indices)
                continue

            idx = self._entity_index_to_keep(entities, entity_indices)

            if idx is None:
                entity_indices_to_remove.update(entity_indices)
            else:
                # keep just one entity
                entity_indices.remove(idx)
                entity_indices_to_remove.update(entity_indices)

                # update that entity to cover the complete word(s)
                entities[idx]["start"] = misaligned_entity["start"]
                entities[idx]["end"] = misaligned_entity["end"]
                entities[idx]["value"] = message.text[
                    misaligned_entity["start"] : misaligned_entity["end"]
                ]

        # sort indices to remove entries at the end of the list first
        # to avoid index out of range errors
        for idx in sorted(entity_indices_to_remove, reverse=True):
            entities.remove(entities[idx])

        return entities

    def _get_misaligned_entities(
        self, tokens: List[Token], entities: List[Dict[Text, Any]]
    ) -> List[Dict[Text, Any]]:
        """Identify entities and tokens that are misaligned.

        Misaligned entities are those that apply only to a part of a word, i.e.
        sub-word.

        Args:
            tokens: list of tokens
            entities: list of detected entities by the entity extractor

        Returns:
            Misaligned entities including the start and end position
            of the final entity in the text and entity indices that are part of this
            misalignment.
        """
        if not tokens:
            return []

        # group tokens: one token cluster corresponds to one word
        token_clusters = self._token_clusters(tokens)

        # added for tests, should only happen if tokens are not set or len(tokens) == 1
        if not token_clusters:
            return []

        misaligned_entities = []
        for entity_idx, entity in enumerate(entities):
            # get all tokens that are covered/touched by the entity
            entity_tokens = self._tokens_of_entity(entity, token_clusters)

            if len(entity_tokens) == 1:
                # entity covers exactly one word
                continue

            # get start and end position of complete word
            # needed to update the final entity later
            start_position = entity_tokens[0].start
            end_position = entity_tokens[-1].end

            # check if an entity was already found that covers the exact same word(s)
            _idx = self._misaligned_entity_index(
                misaligned_entities, start_position, end_position
            )

            if _idx is None:
                misaligned_entities.append(
                    {
                        "start": start_position,
                        "end": end_position,
                        "entity_indices": [entity_idx],
                    }
                )
            else:
                misaligned_entities[_idx]["entity_indices"].append(entity_idx)

        return misaligned_entities

    @staticmethod
    def _misaligned_entity_index(
        word_entity_cluster: List[Dict[Text, Union[int, List[int]]]],
        start_position: int,
        end_position: int,
    ) -> Optional[int]:
        """Get index of matching misaligned entity.

        Args:
            word_entity_cluster: word entity cluster
            start_position: start position
            end_position: end position

        Returns:
            Index of the misaligned entity that matches the provided start and end
            position.
        """
        for idx, cluster in enumerate(word_entity_cluster):
            if cluster["start"] == start_position and cluster["end"] == end_position:
                return idx
        return None

    @staticmethod
    def _tokens_of_entity(
        entity: Dict[Text, Any], token_clusters: List[List[Token]]
    ) -> List[Token]:
        """Get all tokens of token clusters that are covered by the entity.

        The entity can cover them completely or just partly.

        Args:
            entity: the entity
            token_clusters: list of token clusters

        Returns:
            Token clusters that belong to the provided entity.
        """
        entity_tokens = []
        for token_cluster in token_clusters:
            entity_starts_inside_cluster = (
                token_cluster[0].start <= entity["start"] <= token_cluster[-1].end
            )
            entity_ends_inside_cluster = (
                token_cluster[0].start <= entity["end"] <= token_cluster[-1].end
            )

            if entity_starts_inside_cluster or entity_ends_inside_cluster:
                entity_tokens += token_cluster
        return entity_tokens

    @staticmethod
    def _token_clusters(tokens: List[Token]) -> List[List[Token]]:
        """Build clusters of tokens that belong to one word.

        Args:
            tokens: list of tokens

        Returns:
            Token clusters.
        """
        # token cluster = list of token indices that belong to one word
        token_index_clusters = []

        # start at 1 in order to check if current token and previous token belong
        # to the same word
        for token_idx in range(1, len(tokens)):
            previous_token_idx = token_idx - 1
            # two tokens belong to the same word if there is no other character
            # between them
            if tokens[token_idx].start == tokens[previous_token_idx].end:
                # a word was split into multiple tokens
                token_cluster_already_exists = (
                    token_index_clusters
                    and token_index_clusters[-1][-1] == previous_token_idx
                )
                if token_cluster_already_exists:
                    token_index_clusters[-1].append(token_idx)
                else:
                    token_index_clusters.append([previous_token_idx, token_idx])
            else:
                # the token corresponds to a single word
                if token_idx == 1:
                    token_index_clusters.append([previous_token_idx])
                token_index_clusters.append([token_idx])

        return [[tokens[idx] for idx in cluster] for cluster in token_index_clusters]

    @staticmethod
    def _entity_index_to_keep(
        entities: List[Dict[Text, Any]], entity_indices: List[int]
    ) -> Optional[int]:
        """Determine the entity index to keep.

        If there is just one entity index, i.e. one candidate, the index of that
        candidate is returned. If there are multiple candidates, the index of the
        entity with the highest confidence score is returned. If a confidence score
        is missing for any candidate, no entity label is kept.

        Args:
            entities: the full list of entities
            entity_indices: the entity indices to consider

        Returns:
            The index of the entity to keep.
        """
        if len(entity_indices) == 1:
            return entity_indices[0]

        confidences = [
            entities[idx]["confidence"]
            for idx in entity_indices
            if "confidence" in entities[idx]
        ]

        # we don't have confidence values for all entity labels
        if len(confidences) != len(entity_indices):
            return None

        # map the position of the highest confidence back to an index into `entities`
        return entity_indices[confidences.index(max(confidences))]

    @staticmethod
    def filter_irrelevant_entities(extracted: list, requested_dimensions: set) -> list:
        """Only return dimensions the user configured"""
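
To make the clean-up rules above concrete, here is a small standalone sketch (plain Python, no Rasa imports; the tokens, labels, and confidences are invented) that mirrors the word clustering and the confidence-based choice of which entity label to keep:

```python
from typing import Any, Dict, List, Optional, Tuple

# A token is modelled here as a (text, start, end) triple instead of Rasa's Token class.
SimpleToken = Tuple[str, int, int]


def token_clusters(tokens: List[SimpleToken]) -> List[List[SimpleToken]]:
    """Group sub-word tokens into words: two tokens belong to the same word
    if there is no character between them (previous end == next start)."""
    clusters: List[List[SimpleToken]] = []
    for token in tokens:
        if clusters and clusters[-1][-1][2] == token[1]:
            clusters[-1].append(token)
        else:
            clusters.append([token])
    return clusters


def entity_index_to_keep(
    entities: List[Dict[str, Any]], entity_indices: List[int]
) -> Optional[int]:
    """With one candidate, keep it; with several, keep the one with the highest
    confidence; if any candidate has no confidence value, keep none."""
    if len(entity_indices) == 1:
        return entity_indices[0]
    confidences = [
        entities[idx]["confidence"]
        for idx in entity_indices
        if "confidence" in entities[idx]
    ]
    if len(confidences) != len(entity_indices):
        return None
    return entity_indices[confidences.index(max(confidences))]


# "amsterdam" was tokenized into the sub-words "amster" and "dam" ...
tokens = [("i", 0, 1), ("love", 2, 6), ("amster", 7, 13), ("dam", 13, 16)]
print(token_clusters(tokens))  # the last two tokens end up in one word cluster

# ... and each sub-word received its own (competing) entity label.
entities = [
    {"entity": "city", "value": "amster", "confidence": 0.87},
    {"entity": "person", "value": "dam", "confidence": 0.34},
]
print(entity_index_to_keep(entities, [0, 1]))  # -> 0, the "city" label wins
```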
5 changes: 2 additions & 3 deletions rasa/nlu/extractors/mitie_entity_extractor.py
@@ -142,9 +142,8 @@ def process(self, message: Message, **kwargs: Any) -> None:
            message.text, self._tokens_without_cls(message), mitie_feature_extractor
        )
        extracted = self.add_extractor_name(ents)
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        extracted = self.clean_up_entities(message, extracted)
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)

    @classmethod
    def load(
5 changes: 2 additions & 3 deletions rasa/nlu/extractors/spacy_entity_extractor.py
@@ -32,13 +32,12 @@ def process(self, message: Message, **kwargs: Any) -> None:
        spacy_nlp = kwargs.get("spacy_nlp", None)
        doc = spacy_nlp(message.text)
        all_extracted = self.add_extractor_name(self.extract_entities(doc))
+        all_extracted = self.clean_up_entities(message, all_extracted)
        dimensions = self.component_config["dimensions"]
        extracted = SpacyEntityExtractor.filter_irrelevant_entities(
            all_extracted, dimensions
        )
-        message.set(
-            ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
-        )
+        message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)

    @staticmethod
    def extract_entities(doc: "Doc") -> List[Dict[Text, Any]]: