Skip to content

Commit b39538e

Browse files
authored
Merge pull request #790 from snipsco/release/0.19.6
Release 0.19.6
2 parents 52b17fb + 127affd commit b39538e

11 files changed

Lines changed: 338 additions & 32 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
# Changelog
22
All notable changes to this project will be documented in this file.
33

4+
## [0.19.6]
5+
### Fixed
6+
- Raise an error when unknown intents are used in the intents filter [#788](https://github.com/snipsco/snips-nlu/pull/788)
7+
- Fix issue in `DeterministicIntentParser` where stop words that are also entity values of an intent were removed before matching [#789](https://github.com/snipsco/snips-nlu/pull/789)
8+
49
## [0.19.5]
510
### Added
611
- Advanced inference logging in the `CRFSlotFiller` [#776](https://github.com/snipsco/snips-nlu/pull/776)
@@ -264,6 +269,7 @@ several commands.
264269
- Fix compiling issue with `bindgen` dependency when installing from source
265270
- Fix issue in `CRFSlotFiller` when handling builtin entities
266271

272+
[0.19.6]: https://github.com/snipsco/snips-nlu/compare/0.19.5...0.19.6
267273
[0.19.5]: https://github.com/snipsco/snips-nlu/compare/0.19.4...0.19.5
268274
[0.19.4]: https://github.com/snipsco/snips-nlu/compare/0.19.3...0.19.4
269275
[0.19.3]: https://github.com/snipsco/snips-nlu/compare/0.19.2...0.19.3

CONTRIBUTORS.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
Contributors
22
============
33

4-
This is a list of everyone who has made significant contributions to Snips NLU, in alphabetical order. Thanks a lot for the great work!
4+
This is a list of everyone who has made contributions to Snips NLU, in alphabetical order. Thanks a lot for the great work!
55

66
* `Alice Coucke <https://github.com/choufractal>`_
7+
* `cclauss <https://github.com/cclauss>`_
78
* `ddorian <https://github.com/ddorian>`_
89
* `Josh Meyer <https://github.com/JRMeyer>`_
910
* `Matthieu Brouillard <https://github.com/McFoggy>`_

setup.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,12 @@
6060
author=about["__author__"],
6161
author_email=about["__email__"],
6262
license=about["__license__"],
63-
url=about["__uri__"],
63+
url=about["__github_url__"],
64+
project_urls={
65+
"Documentation": about["__doc_url__"],
66+
"Source": about["__github_url__"],
67+
"Tracker": about["__tracker_url__"],
68+
},
6469
install_requires=required,
6570
extras_require=extras_require,
6671
classifiers=[
@@ -70,8 +75,11 @@
7075
"Programming Language :: Python :: 3.5",
7176
"Programming Language :: Python :: 3.6",
7277
"Programming Language :: Python :: 3.7",
78+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
7379
],
80+
keywords="nlu nlp language machine learning text processing intent",
7481
packages=packages,
82+
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4',
7583
include_package_data=True,
7684
entry_points={
7785
"console_scripts": [

snips_nlu/__about__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66

77
__title__ = "snips_nlu"
88
__summary__ = "Snips Natural Language Understanding library"
9-
__uri__ = "https://snips-nlu.readthedocs.io"
9+
__github_url__ = "https://github.com/snipsco/snips-nlu"
10+
__doc_url__ = "https://snips-nlu.readthedocs.io"
11+
__tracker_url__ = "https://github.com/snipsco/snips-nlu/issues"
1012
__author__ = "Clement Doumouro, Adrien Ball"
1113
__email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai"
1214
__license__ = "Apache License, Version 2.0"
1315

14-
__version__ = "0.19.5"
16+
__version__ = "0.19.6"
1517
__model_version__ = "0.19.0"
1618

1719
__download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"

snips_nlu/common/utils.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import numpy as np
1212
import pkg_resources
13-
from future.utils import PY3
13+
from future.utils import text_type
1414

1515
from snips_nlu.constants import (
1616
END, START, RES_MATCH_RANGE, ENTITY_KIND, RES_VALUE)
@@ -94,17 +94,12 @@ def json_string(json_object, indent=2, sort_keys=True):
9494

9595

9696
def unicode_string(string):
97-
if PY3:
98-
unicode_type = str
99-
else:
100-
unicode_type = unicode
101-
102-
if isinstance(string, unicode_type):
97+
if isinstance(string, text_type):
10398
return string
10499
if isinstance(string, bytes):
105100
return string.decode("utf8")
106101
if isinstance(string, newstr):
107-
return unicode_type(string)
102+
return text_type(string)
108103
if isinstance(string, newbytes):
109104
string = bytes(string).decode("utf8")
110105

snips_nlu/dataset/utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import unicode_literals
22

33
from future.utils import iteritems, itervalues
4+
from snips_nlu_utils import normalize
45
from yaml import Loader, SafeLoader
56

67
from snips_nlu.constants import (
@@ -41,6 +42,18 @@ def extract_intent_entities(dataset, entity_filter=None):
4142
return intent_entities
4243

4344

45+
def extract_entity_values(dataset, apply_normalization):
46+
entities_per_intent = {intent: set() for intent in dataset[INTENTS]}
47+
intent_entities = extract_intent_entities(dataset)
48+
for intent, entities in iteritems(intent_entities):
49+
for entity in entities:
50+
entity_values = set(dataset[ENTITIES][entity][UTTERANCES])
51+
if apply_normalization:
52+
entity_values = {normalize(v) for v in entity_values}
53+
entities_per_intent[intent].update(entity_values)
54+
return entities_per_intent
55+
56+
4457
def get_text_from_chunks(chunks):
4558
return "".join(chunk[TEXT] for chunk in chunks)
4659

snips_nlu/intent_parser/deterministic_intent_parser.py

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
RES_MATCH_RANGE, RES_SLOTS, RES_VALUE, SLOT_NAME, START, TEXT, UTTERANCES,
2323
RES_PROBA)
2424
from snips_nlu.dataset import validate_and_format_dataset
25+
from snips_nlu.dataset.utils import extract_entity_values
2526
from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
2627
from snips_nlu.exceptions import IntentNotFoundError, LoadingError
2728
from snips_nlu.intent_parser.intent_parser import IntentParser
@@ -55,10 +56,11 @@ def __init__(self, config=None, **shared):
5556
self._language = None
5657
self._slot_names_to_entities = None
5758
self._group_names_to_slot_names = None
59+
self._stop_words = None
60+
self._stop_words_whitelist = None
5861
self.slot_names_to_group_names = None
5962
self.regexes_per_intent = None
6063
self.entity_scopes = None
61-
self.stop_words = None
6264

6365
@property
6466
def language(self):
@@ -68,12 +70,12 @@ def language(self):
6870
def language(self, value):
6971
self._language = value
7072
if value is None:
71-
self.stop_words = None
73+
self._stop_words = None
7274
else:
7375
if self.config.ignore_stop_words:
74-
self.stop_words = get_stop_words(self.resources)
76+
self._stop_words = get_stop_words(self.resources)
7577
else:
76-
self.stop_words = set()
78+
self._stop_words = set()
7779

7880
@property
7981
def slot_names_to_entities(self):
@@ -142,13 +144,15 @@ def fit(self, dataset, force_retrain=True):
142144
self.slot_names_to_entities = get_slot_name_mappings(dataset)
143145
self.group_names_to_slot_names = _get_group_names_to_slot_names(
144146
self.slot_names_to_entities)
147+
self._stop_words_whitelist = _get_stop_words_whitelist(
148+
dataset, self._stop_words)
145149

146150
# Do not use ambiguous patterns that appear in more than one intent
147151
all_patterns = set()
148152
ambiguous_patterns = set()
149153
intent_patterns = dict()
150154
for intent_name, intent in iteritems(dataset[INTENTS]):
151-
patterns = self._generate_patterns(intent[UTTERANCES],
155+
patterns = self._generate_patterns(intent_name, intent[UTTERANCES],
152156
entity_placeholders)
153157
patterns = [p for p in patterns
154158
if len(p) < self.config.max_pattern_length]
@@ -221,7 +225,6 @@ def placeholder_fn(entity_name):
221225
return _get_entity_name_placeholder(entity_name, self.language)
222226

223227
results = []
224-
cleaned_text = self._preprocess_text(text)
225228

226229
for intent, entity_scope in iteritems(self.entity_scopes):
227230
if intents is not None and intent not in intents:
@@ -233,7 +236,9 @@ def placeholder_fn(entity_name):
233236
all_entities = builtin_entities + custom_entities
234237
mapping, processed_text = replace_entities_with_placeholders(
235238
text, all_entities, placeholder_fn=placeholder_fn)
236-
cleaned_processed_text = self._preprocess_text(processed_text)
239+
cleaned_text = self._preprocess_text(text, intent)
240+
cleaned_processed_text = self._preprocess_text(processed_text,
241+
intent)
237242
for regex in self.regexes_per_intent[intent]:
238243
res = self._get_matching_result(text, cleaned_processed_text,
239244
regex, intent, mapping)
@@ -300,14 +305,19 @@ def get_slots(self, text, intent):
300305
slots = []
301306
return slots
302307

303-
def _preprocess_text(self, string):
308+
def _get_intent_stop_words(self, intent):
309+
whitelist = self._stop_words_whitelist.get(intent, set())
310+
return self._stop_words.difference(whitelist)
311+
312+
def _preprocess_text(self, string, intent):
304313
"""Replaces stop words and characters that are tokenized out by
305314
whitespaces"""
306315
tokens = tokenize(string, self.language)
307316
current_idx = 0
308317
cleaned_string = ""
318+
stop_words = self._get_intent_stop_words(intent)
309319
for token in tokens:
310-
if self.stop_words and normalize_token(token) in self.stop_words:
320+
if stop_words and normalize_token(token) in stop_words:
311321
token.value = "".join(" " for _ in range(len(token.value)))
312322
prefix_length = token.start - current_idx
313323
cleaned_string += "".join((" " for _ in range(prefix_length)))
@@ -352,18 +362,21 @@ def _get_matching_result(self, text, processed_text, regex, intent,
352362
key=lambda s: s[RES_MATCH_RANGE][START])
353363
return extraction_result(parsed_intent, parsed_slots)
354364

355-
def _generate_patterns(self, intent_utterances, entity_placeholders):
365+
def _generate_patterns(self, intent, intent_utterances,
366+
entity_placeholders):
356367
unique_patterns = set()
357368
patterns = []
369+
stop_words = self._get_intent_stop_words(intent)
358370
for utterance in intent_utterances:
359371
pattern = self._utterance_to_pattern(
360-
utterance, entity_placeholders)
372+
utterance, stop_words, entity_placeholders)
361373
if pattern not in unique_patterns:
362374
unique_patterns.add(pattern)
363375
patterns.append(pattern)
364376
return patterns
365377

366-
def _utterance_to_pattern(self, utterance, entity_placeholders):
378+
def _utterance_to_pattern(self, utterance, stop_words,
379+
entity_placeholders):
367380
slot_names_count = defaultdict(int)
368381
pattern = []
369382
for chunk in utterance[DATA]:
@@ -379,7 +392,7 @@ def _utterance_to_pattern(self, utterance, entity_placeholders):
379392
else:
380393
tokens = tokenize_light(chunk[TEXT], self.language)
381394
pattern += [regex_escape(t.lower()) for t in tokens
382-
if normalize(t) not in self.stop_words]
395+
if normalize(t) not in stop_words]
383396

384397
pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
385398
WHITESPACE_PATTERN.join(pattern),
@@ -417,12 +430,18 @@ def from_path(cls, path, **shared):
417430

418431
def to_dict(self):
419432
"""Returns a json-serializable dict"""
433+
stop_words_whitelist = None
434+
if self._stop_words_whitelist is not None:
435+
stop_words_whitelist = {
436+
intent: sorted(values)
437+
for intent, values in iteritems(self._stop_words_whitelist)}
420438
return {
421439
"config": self.config.to_dict(),
422440
"language_code": self.language,
423441
"patterns": self.patterns,
424442
"group_names_to_slot_names": self.group_names_to_slot_names,
425-
"slot_names_to_entities": self.slot_names_to_entities
443+
"slot_names_to_entities": self.slot_names_to_entities,
444+
"stop_words_whitelist": stop_words_whitelist
426445
}
427446

428447
@classmethod
@@ -439,6 +458,12 @@ def from_dict(cls, unit_dict, **shared):
439458
parser.group_names_to_slot_names = unit_dict[
440459
"group_names_to_slot_names"]
441460
parser.slot_names_to_entities = unit_dict["slot_names_to_entities"]
461+
if parser.fitted:
462+
whitelist = unit_dict.get("stop_words_whitelist", dict())
463+
# pylint:disable=protected-access
464+
parser._stop_words_whitelist = {
465+
intent: set(values) for intent, values in iteritems(whitelist)}
466+
# pylint:enable=protected-access
442467
return parser
443468

444469

@@ -487,3 +512,14 @@ def sort_key_fn(slot):
487512
def _get_entity_name_placeholder(entity_label, language):
488513
return "%%%s%%" % "".join(
489514
tokenize_light(entity_label, language)).upper()
515+
516+
517+
def _get_stop_words_whitelist(dataset, stop_words):
518+
entity_values_per_intent = extract_entity_values(
519+
dataset, apply_normalization=True)
520+
stop_words_whitelist = dict()
521+
for intent, entity_values in iteritems(entity_values_per_intent):
522+
whitelist = stop_words.intersection(entity_values)
523+
if whitelist:
524+
stop_words_whitelist[intent] = whitelist
525+
return stop_words_whitelist

snips_nlu/nlu_engine/nlu_engine.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,11 @@ def parse(self, text, intents=None, top_n=None):
163163
elif isinstance(intents, list):
164164
intents = set(intents)
165165

166+
if intents is not None:
167+
for intent in intents:
168+
if intent not in self.dataset_metadata["slot_name_mappings"]:
169+
raise IntentNotFoundError(intent)
170+
166171
if top_n is None:
167172
none_proba = 0.0
168173
for parser in self.intent_parsers:
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from __future__ import unicode_literals
2+
3+
import io
4+
from unittest import TestCase
5+
6+
from snips_nlu.dataset import Dataset, validate_and_format_dataset
7+
from snips_nlu.dataset.utils import extract_entity_values
8+
9+
10+
class TestDatasetUtils(TestCase):
11+
def test_should_extract_entity_values(self):
12+
# Given
13+
set_light_color_yaml = io.StringIO("""
14+
---
15+
type: intent
16+
name: setLightColor
17+
utterances:
18+
- set the lights to [color](blue)
19+
- change the light to [color](yellow) in the [room](bedroom)""")
20+
21+
turn_light_on_yaml = io.StringIO("""
22+
---
23+
type: intent
24+
name: turnLightOn
25+
utterances:
26+
- turn the light on in the [room](kitchen)
27+
- turn the [room](bathroom)'s lights on""")
28+
29+
color_yaml = io.StringIO("""
30+
type: entity
31+
name: color
32+
values:
33+
- [blue, cyan]
34+
- red""")
35+
36+
room_yaml = io.StringIO("""
37+
type: entity
38+
name: room
39+
values:
40+
- garage
41+
- [living room, main room]""")
42+
43+
dataset_files = [set_light_color_yaml, turn_light_on_yaml, color_yaml,
44+
room_yaml]
45+
dataset = Dataset.from_yaml_files("en", dataset_files).json
46+
dataset = validate_and_format_dataset(dataset)
47+
48+
# When
49+
entity_values = extract_entity_values(dataset,
50+
apply_normalization=True)
51+
52+
# Then
53+
expected_values = {
54+
"setLightColor": {"blue", "yellow", "cyan", "red", "bedroom",
55+
"garage", "living room", "main room", "kitchen",
56+
"bathroom"},
57+
"turnLightOn": {"bedroom", "garage", "living room", "main room",
58+
"kitchen", "bathroom"}
59+
}
60+
self.assertDictEqual(expected_values, entity_values)

0 commit comments

Comments
 (0)