replacy
Advanced tools
+1
-1
| Metadata-Version: 2.1 | ||
| Name: replacy | ||
| Version: 3.6.1 | ||
| Version: 2.1.0 | ||
| Summary: ReplaCy = spaCy Matcher + pyInflect. Create rules, correct sentences. | ||
@@ -5,0 +5,0 @@ License: MIT |
+4
-4
| [tool.poetry] | ||
| name = "replacy" | ||
| version = "3.6.1" | ||
| name = "replaCy" | ||
| version = "2.1.0" | ||
| description = "ReplaCy = spaCy Matcher + pyInflect. Create rules, correct sentences." | ||
@@ -26,4 +26,4 @@ authors = [ | ||
| pytest = "^5.3.2" | ||
| spacy= "^3.0.6" | ||
| en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz" } | ||
| spacy= "2.2.0" | ||
| en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz" } | ||
| kenlm = { git = "https://github.com/kpu/kenlm", rev = "master" } |
+51
-59
@@ -126,3 +126,2 @@ import copy | ||
| self.matcher = Matcher(self.nlp.vocab) | ||
| self.predicates = {} | ||
| self._init_matcher() | ||
@@ -159,13 +158,9 @@ self.spans: List[Span] = [] | ||
| match_hooks = ps.get("match_hook", []) | ||
| self.predicates[match_name] = get_predicates( | ||
| match_hooks, self.default_match_hooks, self.custom_match_hooks | ||
| ) | ||
| self.matcher.add(match_name, patterns) | ||
| callback = self._get_callback(match_name, match_hooks) | ||
| self._add_matcher_rule(match_name, patterns, callback) | ||
| @staticmethod | ||
def _fix_alignment_multiple_whitespaces(alignments):
    """Return a new list with every alignment index halved, rounding down.

    NOTE(review): presumably this undoes the optional-whitespace items that
    are interleaved around each real pattern item (which roughly doubles the
    indices reported by the matcher) -- confirm against the pattern builder.

    Floor division keeps the arithmetic in integers, so the result is exact
    for indices of any size and never passes through a float.

    :param alignments: iterable of non-negative integer pattern indices.
    :return: list of integers, one per input index.
    """
    return [a // 2 for a in alignments]
def _add_matcher_rule(self, match_name, patterns, callback):
    """Register one rule on ``self.matcher``.

    The arguments are forwarded positionally as (name, callback, patterns).
    NOTE(review): this argument order looks like the spaCy 2.x
    ``Matcher.add(key, on_match, *patterns)`` form -- confirm against the
    pinned spaCy version.
    """
    matcher = self.matcher
    matcher.add(match_name, callback, patterns)
| @staticmethod | ||
| def _allow_multiple_whitespaces(patterns): | ||
| def _allow_multiple_whitespaces(self, patterns): | ||
| """ | ||
@@ -178,56 +173,60 @@ allow matching tokens separated by multiple whitespaces | ||
| """ | ||
| if True: | ||
| if self.allow_multiple_whitespaces: | ||
| white_pattern = {"IS_SPACE": True, "OP": "?"} | ||
| normalized_patterns = [] | ||
| for pattern in patterns: | ||
| normalized_pattern = [white_pattern] | ||
| for p in pattern: | ||
| normalized_pattern += [p, white_pattern] | ||
| normalized_patterns.append(normalized_pattern) | ||
| normalized_patterns = [white_pattern] | ||
| for p in patterns: | ||
| normalized_patterns += [p, white_pattern] | ||
| patterns = normalized_patterns | ||
| return patterns | ||
| @staticmethod | ||
| def _remove_unsupported(patterns): | ||
| def _remove_unsupported(self, patterns): | ||
| # remove custom attributes not supported by spaCy Matcher | ||
| for pattern in patterns: | ||
| for p in pattern: | ||
| if "TEMPLATE_ID" in p: | ||
| del p["TEMPLATE_ID"] | ||
| for p in patterns: | ||
| if "TEMPLATE_ID" in p: | ||
| del p["TEMPLATE_ID"] | ||
| return patterns | ||
| def _callback(self, doc, match): | ||
| match_id, start, end, alignments = match | ||
| alignments = ReplaceMatcher._fix_alignment_multiple_whitespaces(alignments) | ||
| def _get_callback(self, match_name, match_hooks): | ||
| """ | ||
| Most matches have the same logic to be executed each time a match is found | ||
| Some matches have extra logic, defined in match_hooks | ||
| """ | ||
| # Get predicates once, callback is returned in a closure with this information | ||
| predicates = get_predicates( | ||
| match_hooks, self.default_match_hooks, self.custom_match_hooks | ||
| ) | ||
| match_name = self.nlp.vocab[match_id].text | ||
| def cb(matcher, doc, i, matches): | ||
| match_id, start, end = matches[i] | ||
| for pred in self.predicates[match_name]: | ||
| try: | ||
| if pred(doc, start, end): | ||
| return None | ||
| except IndexError: | ||
| break | ||
| for pred in predicates: | ||
| try: | ||
| if pred(doc, start, end): | ||
| return None | ||
| except IndexError: | ||
| break | ||
| match_name = self.nlp.vocab[match_id].text | ||
| span = self.Span(doc, start, end) | ||
| span = self.Span(doc, start, end) | ||
| # find in match_dict if needed | ||
| span._.match_name = match_name | ||
| # find in match_dict if needed | ||
| span._.match_name = match_name | ||
| pre_suggestions = self.match_dict[match_name]["suggestions"] | ||
| pre_suggestions = self.match_dict[match_name]["suggestions"] | ||
| span._.suggestions = [] | ||
| span._.suggestions = [] | ||
| for i, x in enumerate(pre_suggestions): | ||
| span._.suggestions += self.process_suggestions( | ||
| x, doc, start, end, match_name, i | ||
| ) | ||
| for i, x in enumerate(pre_suggestions): | ||
| span._.suggestions += self.process_suggestions( | ||
| x, doc, start, end, match_name, i, alignments | ||
| ) | ||
| for novel_prop, default_value in self.novel_prop_defaults.items(): | ||
| setattr( | ||
| span._, | ||
| novel_prop, | ||
| self.match_dict[match_name].get(novel_prop, default_value), | ||
| ) | ||
| self.spans.append(span) | ||
| for novel_prop, default_value in self.novel_prop_defaults.items(): | ||
| setattr( | ||
| span._, | ||
| novel_prop, | ||
| self.match_dict[match_name].get(novel_prop, default_value), | ||
| ) | ||
| self.spans.append(span) | ||
| return cb | ||
@@ -279,3 +278,3 @@ def _set_scorer(self, lm_path): | ||
| def process_suggestions( | ||
| self, pre_suggestion, doc, start, end, match_name, pre_suggestion_id, alignments | ||
| self, pre_suggestion, doc, start, end, match_name, pre_suggestion_id | ||
| ): | ||
@@ -286,3 +285,3 @@ # get token <-> pattern correspondence | ||
| suggestion_variants = self.suggestion_gen( | ||
| pre_suggestion, doc, start, end, pattern, pre_suggestion_id, alignments | ||
| pre_suggestion, doc, start, end, pattern, pre_suggestion_id | ||
| ) | ||
@@ -386,10 +385,3 @@ # assert there aren't more than max_suggestions_count | ||
| # this fills up self.spans | ||
| matches = self.matcher(doc, with_alignments=True) | ||
| # do the callback here instead of to pass it as callback on match | ||
| # here we alignment information to use for pattern ref | ||
| # we don't have this info on match callback | ||
| for match in matches: | ||
| self._callback(doc, match) | ||
| self.matcher(doc) | ||
| for _, component in self.pipeline: | ||
@@ -396,0 +388,0 @@ # the default pipeline will: |
+0
-1
@@ -72,3 +72,2 @@ import json | ||
| import kenlm | ||
| return kenlm.Model(model_path) |
@@ -181,8 +181,6 @@ """ | ||
| def relative_x_is_y( | ||
| children_or_ancestors: str, pos_or_dep: str, value: Union[str, List[str]] | ||
| ) -> SpacyMatchPredicate: | ||
| """ | ||
| This hook looks at all the tokens in a matched span to determine | ||
| whether any of the children or the first ancestor have a given .pos_ or | ||
| def relative_x_is_y(children_or_ancestors: str, pos_or_dep: str, value: Union[str, List[str]]) -> SpacyMatchPredicate: | ||
| ''' | ||
| This hook looks at all the tokens in a matched span to determine | ||
| whether any of the children or the first ancestor have a given .pos_ or | ||
| .dep_. This replaces the implementation of the Dependency Matcher in | ||
@@ -202,3 +200,3 @@ the previous version by looking at token.children or token.ancestors in | ||
| } | ||
| """ | ||
| ''' | ||
@@ -215,8 +213,6 @@ if not isinstance(value, list): | ||
| if children_or_ancestors not in ["children", "ancestors"]: | ||
| raise ValueError( | ||
| "children_or_ancestors must be set to either `children` or `ancestors`" | ||
| ) | ||
| raise ValueError("children_or_ancestors must be set to either `children` or `ancestors`") | ||
| if pos_or_dep not in ["pos", "dep", "tag"]: | ||
| raise ValueError("pos_or_dep must be set to either `pos`, `dep`, or `tag`!") | ||
| if pos_or_dep not in ["pos", "dep"]: | ||
| raise ValueError("pos_or_dep must be set to either `pos` or `dep`!") | ||
@@ -229,14 +225,6 @@ def _in_children(doc, start, end): | ||
| if pos_or_dep == "pos": | ||
| return any( | ||
| [child.pos_ == val for tok in match_span for child in tok.children] | ||
| ) | ||
| return any([child.pos_ == val for tok in match_span for child in tok.children]) | ||
| elif pos_or_dep == "dep": | ||
| return any( | ||
| [child.dep_ == val for tok in match_span for child in tok.children] | ||
| ) | ||
| elif pos_or_dep == "tag": | ||
| return any( | ||
| [child.tag_ == val for tok in match_span for child in tok.children] | ||
| ) | ||
| return any([child.dep_ == val for tok in match_span for child in tok.children]) | ||
| def _in_ancestors(doc, start, end): | ||
@@ -259,8 +247,2 @@ if end >= len(doc): | ||
| return False | ||
| if pos_or_dep == "tag": | ||
| for t in match_span: | ||
| ancestor = list(t.ancestors)[0] if len(list(t.ancestors)) else None | ||
| if ancestor and ancestor.tag_ == val: | ||
| return True | ||
| return False | ||
@@ -267,0 +249,0 @@ if children_or_ancestors == "children": |
@@ -12,3 +12,3 @@ from typing import List | ||
| for suggestion in span.suggestions: | ||
| if (span.doc[span.start : span.end].text) == suggestion: | ||
| if (span.doc[span.start:span.end].text) == suggestion: | ||
| continue | ||
@@ -29,3 +29,3 @@ suggestions.append(suggestion) | ||
| if len(span.suggestions): | ||
| span_text = span.doc[span.start : span.end].text.rstrip(" \r\n") | ||
| span_text = span.doc[span.start:span.end].text.rstrip(" \r\n") | ||
| suggestions = [] | ||
@@ -42,2 +42,2 @@ for suggestion in span.suggestions: | ||
| filtered_spans.append(span) | ||
| return filtered_spans | ||
| return filtered_spans |
+28
-0
@@ -202,1 +202,29 @@ import warnings | ||
| return self.inflect_string(word, tag=tag, pos=pos) | ||
def insert(self, doc, suggestion: str, index: int):
    """
    Return the sentence with the token at ``index`` replaced by an inflected
    form of ``suggestion``.

    If no inflection is available, the original sentence text is returned
    unchanged.
    ex. She washed her eggs. -> She ate her eggs.
    If several inflections are returned, the first one is used.

    :param doc: a parsed doc (anything exposing ``.text``) or a plain string,
        which is first run through ``self.nlp``.
    :param suggestion: the word to inflect and insert.
    :param index: position of the token to replace.
    :return: the rewritten sentence, or ``doc.text`` when nothing can be inserted.
    """
    # A plain string has no `.text`; convert it to a doc first.
    if not hasattr(doc, "text"):
        doc = self.nlp(doc)

    infl_tokens = self.auto_inflect(doc, suggestion, index)
    # Take the first candidate if there is one; an empty result or an empty
    # candidate both mean "inflection not supported".
    infl_token = infl_tokens[0] if len(infl_tokens) else None
    if not infl_token:
        return doc.text

    token = doc[index]
    # Splice on character offsets so the surrounding text (including
    # whitespace and punctuation) is preserved exactly.
    return "".join(
        [doc.text[: token.idx], infl_token, doc.text[token.idx + len(token):]]
    )
+171
-11
@@ -7,12 +7,172 @@ import copy | ||
| class RefMatcher: | ||
def __call__(self, span, orig_pattern, alignments):
    """Group span-token offsets by the pattern item each one is aligned to.

    ``alignments`` holds, for every token of the matched span (in order), the
    index of the pattern item that token belongs to. The result maps each
    pattern index that occurs to the ascending list of token offsets aligned
    to it; pattern indices that never occur are absent from the mapping.

    ``span`` and ``orig_pattern`` are not used here; they are accepted only so
    the call signature stays uniform for callers.

    :param span: unused.
    :param orig_pattern: unused.
    :param alignments: iterable of hashable pattern indices, one per token.
    :return: dict ``{pattern_idx: [span_token_idx, ...]}``.
    """
    pattern_ref = {}
    # One pass over the alignments: offsets are appended in increasing order,
    # so every list is already sorted.
    for span_token_idx, pattern_idx in enumerate(alignments):
        pattern_ref.setdefault(pattern_idx, []).append(span_token_idx)
    return pattern_ref
def __init__(self, nlp):
    """Store the language pipeline and create a Matcher on its vocabulary.

    :param nlp: pipeline object exposing a ``vocab`` attribute.
    """
    self.nlp = nlp
    vocab = nlp.vocab
    self.matcher = Matcher(vocab)
def clean_matcher(self):
    """Remove the integer-keyed rules 0..99 from ``self.matcher``.

    There is no native way to clear a spaCy matcher or to list its rule
    names, so rules are always registered under integers counting up from
    zero and removed here by probing those integers.

    The scan stops as soon as the matcher is empty. Keys of 100 or above are
    never probed, so rules stored under them are left in place.
    """
    for key in range(100):
        if len(self.matcher) <= 0:
            # nothing left to remove
            break
        if key in self.matcher:
            self.matcher.remove(key)
| @staticmethod | ||
def is_negative(p):
    """Return True when pattern item ``p`` has the operator ``"OP": "!"``.

    :param p: a token-pattern dict.
    :return: bool; False when ``p`` has no ``"OP"`` key.
    """
    return p.get("OP") == "!"
| @staticmethod | ||
def is_droppable(p):
    """Return True when pattern item ``p`` may match zero tokens.

    That is the case for the operators ``"*"`` (zero or more) and ``"?"``
    (zero or one).

    :param p: a token-pattern dict.
    :return: bool; False when ``p`` has no ``"OP"`` key.
    """
    return p.get("OP") in ("*", "?")
| @staticmethod | ||
def is_multitoken(p):
    """Return True when pattern item ``p`` may match more than one token.

    That is the case for the operators ``"*"`` (zero or more) and ``"+"``
    (one or more).

    :param p: a token-pattern dict.
    :return: bool; False when ``p`` has no ``"OP"`` key.
    """
    return p.get("OP") in ("*", "+")
| def remove_skipped_ops(self, span, pattern): | ||
| skipped_idx = [] | ||
| op_tokens = [i for (i, p) in enumerate(pattern) if RefMatcher.is_droppable(p)] | ||
| for op in op_tokens: | ||
| op_pattern = copy.deepcopy(pattern) | ||
| # remove "?" to require 1 instead of 0 | ||
| if op_pattern[op]["OP"] == "?": | ||
| if len(op_pattern[op]) == 1: | ||
| # if no more props, | ||
| # add dummy string that will never match | ||
| # since its not 1 token :) | ||
| op_pattern[op]["TEXT"] = "alice and bob" | ||
| op_pattern[op]["OP"] = "!" | ||
| del op_pattern[op]["OP"] | ||
| # change "*" to "+", to require 1+ instead of 0+ | ||
| elif op_pattern[op]["OP"] == "*": | ||
| op_pattern[op]["OP"] = "+" | ||
| self.matcher.add(op, None, op_pattern) | ||
| # check whether it still matches | ||
| matches = self.matcher(span.as_doc()) | ||
| max_matches = [m for (m, s, e) in matches if (s == 0) and (e == len(span))] | ||
| # clean the matcher | ||
| self.clean_matcher() | ||
| non_op_pattern = [] | ||
| for i, p in enumerate(pattern): | ||
| # is optional | ||
| if "OP" in p: | ||
| # but not found | ||
| if not i in max_matches and not RefMatcher.is_negative(p): | ||
| # => to do marked non matched, skip | ||
| skipped_idx.append(i) | ||
| continue | ||
| else: | ||
| if p["OP"] == "+": | ||
| if len(p) == 1: | ||
| # if no more props, | ||
| # add dummy string that will never match | ||
| # since its not 1 token :) | ||
| p["TEXT"] = "alice and bob" | ||
| p["OP"] = "!" | ||
| else: | ||
| del p["OP"] | ||
| elif p["OP"] == "*": | ||
| p["OP"] = "+" | ||
| non_op_pattern.append(p) | ||
| return non_op_pattern, skipped_idx | ||
def insert_empty_idx(self, pattern_ref, idx):
    """Open an empty slot at pattern index ``idx``.

    Every entry whose key is ``idx`` or greater moves up by one; entries
    below ``idx`` keep their key. The new key ``idx`` maps to an empty list.
    The input mapping is not modified.

    :param pattern_ref: dict mapping integer pattern indices to token lists.
    :param idx: the pattern index at which to insert the empty slot.
    :return: a new dict with the shifted keys and ``idx: []``.
    """
    shifted = {
        (key + 1 if key >= idx else key): tokens
        for key, tokens in pattern_ref.items()
    }
    shifted[idx] = []
    return shifted
def shift_pattern_ref(self, pattern_ref, skipped_idx):
    """Re-insert an empty slot for every skipped pattern index.

    The indices are applied one after another, in the order given, through
    ``insert_empty_idx``; each insertion shifts the keys at or above it.

    :param pattern_ref: dict mapping pattern indices to token lists.
    :param skipped_idx: iterable of pattern indices that matched no token.
    :return: the mapping after all insertions (the input itself when
        ``skipped_idx`` is empty).
    """
    shifted = pattern_ref
    for position in skipped_idx:
        shifted = self.insert_empty_idx(shifted, position)
    return shifted
| def __call__(self, span, orig_pattern): | ||
| pattern = copy.deepcopy(orig_pattern) | ||
| # remove props not supported by SpaCy matcher: | ||
| for p in pattern: | ||
| if "TEMPLATE_ID" in p: | ||
| del p["TEMPLATE_ID"] | ||
| # case I: tokens <-> patterns | ||
| # if lengths match | ||
| # if no OP | ||
| # => everything has been matched | ||
| if len(span) == len(pattern) and not any(["OP" in p for p in pattern]): | ||
| return {k: [k] for k in range(len(pattern))} | ||
| # check which tokens are matched, remove non matched | ||
| non_op_pattern, skipped_idx = self.remove_skipped_ops(span, pattern) | ||
| # case II: | ||
| # if lengths match | ||
| # if no multitoken OPs | ||
| # => everything has been matched | ||
| if len(span) == len(non_op_pattern) and not any( | ||
| [RefMatcher.is_multitoken(p) for p in non_op_pattern] | ||
| ): | ||
| pattern_ref = {k: [k] for k in range(len(non_op_pattern))} | ||
| return self.shift_pattern_ref(pattern_ref, skipped_idx) | ||
| # case III: | ||
| # worst case | ||
| # get shifts for multitokens | ||
| # ie rematching cropped spans and patterns | ||
| # A. get cropped patterns | ||
| for i in range(len(non_op_pattern)): | ||
| self.matcher.add(i, None, non_op_pattern[i:]) | ||
| # B. get cropped spans | ||
| docs = [span[i:].as_doc() for i in range(len(span))] | ||
| # C. rematch | ||
| matches = self.matcher.pipe(docs, batch_size=len(span), return_matches=True) | ||
| # D. get pattern_ref | ||
| pattern_ref = {} | ||
| for i, (d, m) in enumerate(matches): | ||
| # take max span match for doc | ||
| if len(m): | ||
| # len 0 shouldn't happen except weird white spaces | ||
| m_id, m_start, m_end = max(m, key=lambda x: x[2] - x[1]) | ||
| # if cropped span matches cropped pattern | ||
| # 1st token of cropped span belongs to 1st cropped pattern item | ||
| if not m_id in pattern_ref: | ||
| pattern_ref[m_id] = [i] | ||
| else: | ||
| # no changes in pattern | ||
| # pattern item had more tokens matched | ||
| # ex. "very fast ..." & "fast ... " | ||
| # matched with {"POS": "ADJ", "OP": "+"} ... | ||
| pattern_ref[m_id].append(i) | ||
| # clean | ||
| self.clean_matcher() | ||
| # shift by skipped ops | ||
| pattern_ref = self.shift_pattern_ref(pattern_ref, skipped_idx) | ||
| return pattern_ref |
@@ -5,3 +5,3 @@ { | ||
| "definitions": { | ||
| "replacyAttributeItem": { | ||
| "replacyAttribute": { | ||
| "type": "object", | ||
@@ -57,9 +57,2 @@ "properties": { | ||
| }, | ||
| "replacyAttribute": { | ||
| "type": "array", | ||
| "items": { | ||
| "$ref": "#/definitions/replacyAttributeItem" | ||
| }, | ||
| "minItems": 1 | ||
| }, | ||
| "spacyOperator": { | ||
@@ -109,3 +102,3 @@ "type": "object", | ||
| }, | ||
| "textOperator": { | ||
| "textOperator":{ | ||
| "type": "object", | ||
@@ -149,53 +142,38 @@ "additionalProperties": false, | ||
| "properties": { | ||
| "TEXT": { | ||
| "$ref": "#/definitions/textValue" | ||
| }, | ||
| "FROM_TEMPLATE_ID": { | ||
| "type": "integer" | ||
| }, | ||
| "PATTERN_REF": { | ||
| "type": "integer" | ||
| }, | ||
| "REPLACY_OP": { | ||
| "enum": ["LOWER", "UPPER", "TITLE"] | ||
| }, | ||
| "INFLECTION": { | ||
| "enum": [ | ||
| "ADJ", | ||
| "ADV", | ||
| "PROPN", | ||
| "VERB", | ||
| "AUX", | ||
| "JJ", | ||
| "JJR", | ||
| "JJS", | ||
| "RB", | ||
| "RBR", | ||
| "RBS", | ||
| "NN", | ||
| "NNS", | ||
| "NNP", | ||
| "NNPS", | ||
| "VB", | ||
| "VBD", | ||
| "VBG", | ||
| "VBN", | ||
| "VBP", | ||
| "VBZ", | ||
| "MD", | ||
| "ALL" | ||
| ] | ||
| }, | ||
| "REGEX": { | ||
| "type": "string", | ||
| "minLength": 2 | ||
| }, | ||
| "SUFFIX": { | ||
| "type": "string", | ||
| "minLength": 1 | ||
| }, | ||
| "PREFIX": { | ||
| "type": "string", | ||
| "minLength": 1 | ||
| } | ||
| "TEXT": { | ||
| "$ref": "#/definitions/textValue" | ||
| }, | ||
| "FROM_TEMPLATE_ID": { | ||
| "type": "integer" | ||
| }, | ||
| "PATTERN_REF": { | ||
| "type": "integer" | ||
| }, | ||
| "INFLECTION": { | ||
| "enum": [ | ||
| "ADJ", | ||
| "ADV", | ||
| "PROPN", | ||
| "VERB", | ||
| "AUX", | ||
| "JJ", | ||
| "JJR", | ||
| "JJS", | ||
| "RB", | ||
| "RBR", | ||
| "RBS", | ||
| "NN", | ||
| "NNS", | ||
| "NNP", | ||
| "NNPS", | ||
| "VB", | ||
| "VBD", | ||
| "VBG", | ||
| "VBN", | ||
| "VBP", | ||
| "VBZ", | ||
| "MD", | ||
| "ALL" | ||
| ] | ||
| } | ||
| } | ||
@@ -245,3 +223,3 @@ }, | ||
| "patternProperties": { | ||
| "^[a-z_-][A-Za-z0-9_-]*$": { | ||
| "^[a-z_][A-Za-z0-9_]*$": { | ||
| "type": "object", | ||
@@ -248,0 +226,0 @@ "properties": { |
| { | ||
| "extract-revenge": { | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "LEMMA": "extract", | ||
| "TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "exact", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "match_hook": [ | ||
| { | ||
| "name": "succeeded_by_phrase", | ||
| "args": "revenge", | ||
| "match_if_predicate_is": true | ||
| } | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "And at the same time extract revenge on those he so despises?", | ||
| "Watch as Tampa Bay extracts revenge against his former Los Angeles Rams team." | ||
| ], | ||
| "negative": [ | ||
| "Mother flavours her custards with lemon extract." | ||
| ] | ||
| "extract-revenge": { | ||
| "patterns": [ | ||
| { | ||
| "LEMMA": "extract", | ||
| "TEMPLATE_ID": 1 | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "exact", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| }, | ||
| "make-due": { | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "LEMMA": "make", | ||
| "TEMPLATE_ID": 1 | ||
| }, | ||
| { | ||
| "LOWER": "due" | ||
| } | ||
| ] | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "make", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| }, | ||
| { | ||
| "TEXT": "do" | ||
| } | ||
| ] | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "Viewers will have to make due with tired re-runs and second-rate movies." | ||
| ], | ||
| "negative": [ | ||
| "The empty vessels make the greatest sound.", | ||
| "I'll make do.", | ||
| "She only has sons; she'll make dudes." | ||
| ] | ||
| ] | ||
| ], | ||
| "match_hook": [ | ||
| { | ||
| "name": "succeeded_by_phrase", | ||
| "args": "revenge", | ||
| "match_if_predicate_is": true | ||
| } | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "And at the same time extract revenge on those he so despises?", | ||
| "Watch as Tampa Bay extracts revenge against his former Los Angeles Rams team." | ||
| ], | ||
| "negative": ["Mother flavours her custards with lemon extract."] | ||
| } | ||
| }, | ||
| "make-due": { | ||
| "patterns": [ | ||
| { | ||
| "LEMMA": "make", | ||
| "TEMPLATE_ID": 1 | ||
| }, | ||
| { | ||
| "LOWER": "due" | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "make", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| }, | ||
| "comment": "this is an example comment", | ||
| "description": "The expression is \"make do\".", | ||
| "category": "R:VERB", | ||
| "unexpected": "replaCy should handle arbitrary properties here, and attach them to the relevant spans" | ||
| }, | ||
| "requirement": { | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "LEMMA": "requirement", | ||
| "POS": "NOUN", | ||
| "TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "need", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "match_hook": [ | ||
| { | ||
| "name": "part_of_compound", | ||
| "match_if_predicate_is": false | ||
| }, | ||
| { | ||
| "name": "preceded_by_lemma", | ||
| "kwargs": { | ||
| "lemma": "hello", | ||
| "distance": 22 | ||
| }, | ||
| "match_if_predicate_is": false | ||
| } | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "The system has the following requirements: blood of a virgin, suffering, and cat food.", | ||
| "Our immediate requirement is extra staff." | ||
| ], | ||
| "negative": [ | ||
| "There is a residency requirement for obtaining citizenship.", | ||
| "What is the minimum entrance requirement for this course?" | ||
| ] | ||
| { | ||
| "TEXT": "do" | ||
| } | ||
| ] | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "Viewers will have to make due with tired re-runs and second-rate movies." | ||
| ], | ||
| "negative": [ | ||
| "The empty vessels make the greatest sound.", | ||
| "I'll make do.", | ||
| "She only has sons; she'll make dudes." | ||
| ] | ||
| }, | ||
| "lt-example": { | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "LOWER": { | ||
| "IN": [ | ||
| "have", | ||
| "has" | ||
| ] | ||
| } | ||
| }, | ||
| { | ||
| "TAG": { | ||
| "IN": [ | ||
| "VBD", | ||
| "VBP", | ||
| "VB", | ||
| "VBN" | ||
| ] | ||
| } | ||
| }, | ||
| { | ||
| "TAG": { | ||
| "NOT_IN": [ | ||
| "VBG" | ||
| ] | ||
| } | ||
| } | ||
| ] | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "PATTERN_REF": 0 | ||
| }, | ||
| { | ||
| "PATTERN_REF": 1, | ||
| "INFLECTION": "VBN" | ||
| }, | ||
| { | ||
| "PATTERN_REF": 2 | ||
| } | ||
| ] | ||
| ], | ||
| "description": "Possible agreement error -- use past participle here", | ||
| "test": { | ||
| "positive": [ | ||
| "I have eat this" | ||
| ], | ||
| "negative": [ | ||
| "I ate this" | ||
| ] | ||
| "comment": "this is an example comment", | ||
| "description": "The expression is \"make do\".", | ||
| "category": "R:VERB", | ||
| "unexpected": "replaCy should handle arbitrary properties here, and attach them to the relevant spans" | ||
| }, | ||
| "requirement": { | ||
| "patterns": [ | ||
| { | ||
| "LEMMA": "requirement", | ||
| "POS": "NOUN", | ||
| "TEMPLATE_ID": 1 | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "need", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| }, | ||
| "assemble_attach_together": { | ||
| "comment": "Match the word together if it is a modifier of any form of assemble or attach, and suggest removing it", | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "LOWER": "together" | ||
| } | ||
| ] | ||
| ], | ||
| "match_hook": [ | ||
| { | ||
| "name": "relative_x_is_y", | ||
| "kwargs": { | ||
| "children_or_ancestors": "ancestors", | ||
| "pos_or_dep": "dep", | ||
| "value": "ROOT" | ||
| }, | ||
| "match_if_predicate_is": true | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "" | ||
| } | ||
| ] | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "Avengers, assemble the team together!", | ||
| "We assembled the furniture together." | ||
| ], | ||
| "negative": [ | ||
| "After we assemble, we can go together", | ||
| "My arm is attached to my shoulder, I like that they are together." | ||
| ] | ||
| ] | ||
| ], | ||
| "match_hook": [ | ||
| { | ||
| "name": "part_of_compound", | ||
| "match_if_predicate_is": false | ||
| }, | ||
| { | ||
| "name": "preceded_by_lemma", | ||
| "kwargs": { | ||
| "lemma": "hello", | ||
| "distance": 22 | ||
| }, | ||
| "match_if_predicate_is": false | ||
| } | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "The system has the following requirements: blood of a virgin, suffering, and cat food.", | ||
| "Our immediate requirement is extra staff." | ||
| ], | ||
| "negative": [ | ||
| "There is a residency requirement for obtaining citizenship.", | ||
| "What is the minimum entrance requirement for this course?" | ||
| ] | ||
| } | ||
| }, | ||
| "lt-example": { | ||
| "patterns": [ | ||
| { | ||
| "LOWER": { | ||
| "IN": ["have", "has"] | ||
| } | ||
| }, | ||
| "effective_in_its_ability": { | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "LEMMA": "be", | ||
| "TEMPLATE_ID": 1 | ||
| }, | ||
| { | ||
| "LOWER": "effective" | ||
| }, | ||
| { | ||
| "LOWER": "in" | ||
| }, | ||
| { | ||
| "DEP": "poss" | ||
| }, | ||
| { | ||
| "LOWER": "ability" | ||
| }, | ||
| { | ||
| "LOWER": "to" | ||
| }, | ||
| { | ||
| "POS": "VERB" | ||
| } | ||
| ] | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "effectively" | ||
| }, | ||
| { | ||
| "PATTERN_REF": 6, | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "comment": "You can use pattern_ref and from_template_id together", | ||
| "test": { | ||
| "positive": [ | ||
| "The pail was effective in its ability to carry water" | ||
| ], | ||
| "negative": [ | ||
| "The pail wasn't effective in its ability to carry water" | ||
| ] | ||
| }, | ||
| { | ||
| "TAG": { | ||
| "IN": ["VBD", "VBP", "VB"] | ||
| } | ||
| }, | ||
| "dupe-test": { | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "LEMMA": "make", | ||
| "TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "build", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "comment": "This is a bad match, it is here to demonstrate overlap behavior", | ||
| "test": { | ||
| "positive": [ | ||
| "I will make something" | ||
| ], | ||
| "negative": [ | ||
| "I will build something" | ||
| ] | ||
| }, | ||
| { | ||
| "TAG": { | ||
| "NOT_IN": ["VBG"] | ||
| } | ||
| }, | ||
| "all-caps": { | ||
| "patterns": [ | ||
| [ | ||
| { | ||
| "IS_UPPER": true, | ||
| "TEXT": { | ||
| "REGEX": "^[A-Z]{2,}$" | ||
| }, | ||
| "OP": "+" | ||
| }, | ||
| { | ||
| "IS_LOWER": true, | ||
| "OP": "*" | ||
| } | ||
| ] | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "PATTERN_REF": 0, | ||
| "REPLACY_OP": "LOWER" | ||
| }, | ||
| { | ||
| "PATTERN_REF": 1, | ||
| "REPLACY_OP": "UPPER" | ||
| } | ||
| ] | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "TENNIS is a lovely game.", | ||
| "THIS IS SO SILLY", | ||
| "THIS IS SO SILLY waay to go" | ||
| ], | ||
| "negative": [ | ||
| "this is so silly" | ||
| ] | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "PATTERN_REF": 0 | ||
| }, | ||
| { | ||
| "PATTERN_REF": 1, | ||
| "INFLECTION": "VBN" | ||
| }, | ||
| { | ||
| "PATTERN_REF": 2 | ||
| } | ||
| ] | ||
| ], | ||
| "description": "Possible agreement error -- use past participle here", | ||
| "test": { | ||
| "positive": ["I have eat this"], | ||
| "negative": ["I have eaten this"] | ||
| } | ||
| }, | ||
| "assemble_attach_together": { | ||
| "comment": "Match the word together if it is a modifier of any form of assemble or attach, and suggest removing it", | ||
| "patterns": [ | ||
| { | ||
| "LOWER": "together" | ||
| } | ||
| ], | ||
| "match_hook": [ | ||
| { | ||
| "name": "relative_x_is_y", | ||
| "kwargs": { | ||
| "children_or_ancestors": "ancestors", | ||
| "pos_or_dep": "dep", | ||
| "value": "ROOT" | ||
| }, | ||
| "match_if_predicate_is": true | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "" | ||
| } | ||
| ] | ||
| ], | ||
| "test": { | ||
| "positive": [ | ||
| "Avengers, assemble the team together!", | ||
| "We assembled the furniture together." | ||
| ], | ||
| "negative": [ | ||
| "After we assemble, we can go together", | ||
| "My arm is attached to my shoulder, I like that they are together." | ||
| ] | ||
| } | ||
| }, | ||
| "effective_in_its_ability": { | ||
| "patterns": [ | ||
| { | ||
| "LEMMA": "be", | ||
| "TEMPLATE_ID": 1 | ||
| }, | ||
| { | ||
| "LOWER": "effective" | ||
| }, | ||
| { | ||
| "LOWER": "in" | ||
| }, | ||
| { | ||
| "DEP": "poss" | ||
| }, | ||
| { | ||
| "LOWER": "ability" | ||
| }, | ||
| { | ||
| "LOWER": "to" | ||
| }, | ||
| { | ||
| "POS": "VERB" | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "effectively" | ||
| }, | ||
| { | ||
| "PATTERN_REF": 6, | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "comment": "You can use pattern_ref and from_template_id together", | ||
| "test": { | ||
| "positive": ["The pail was effective in its ability to carry water"], | ||
| "negative": ["The pail wasn't effective in its ability to carry water"] | ||
| } | ||
| }, | ||
| "dupe-test": { | ||
| "patterns": [ | ||
| { | ||
| "LEMMA": "make", | ||
| "TEMPLATE_ID": 1 | ||
| } | ||
| ], | ||
| "suggestions": [ | ||
| [ | ||
| { | ||
| "TEXT": "build", | ||
| "FROM_TEMPLATE_ID": 1 | ||
| } | ||
| ] | ||
| ], | ||
| "comment": "This is a bad match, it is here to demonstrate overlap behavior", | ||
| "test": { | ||
| "positive": ["I will make something"], | ||
| "negative": ["I will build something"] | ||
| } | ||
| } | ||
| } |
@@ -8,7 +8,3 @@ from typing import List | ||
| for span in spans: | ||
| suggestions_separator = ( | ||
| span.suggestions_separator | ||
| if span.has_extension("suggestions_separator") | ||
| else " " | ||
| ) | ||
| suggestions_separator = span.suggestions_separator if span.has_extension('suggestions_separator') else " " | ||
| suggestions: List[str] = [] | ||
@@ -15,0 +11,0 @@ for s in span._.suggestions: |
+12
-15
@@ -16,3 +16,3 @@ import re | ||
| self.inflector = Inflector(nlp=nlp, forms_lookup=self.forms_lookup) | ||
| self.ref_matcher = RefMatcher() | ||
| self.ref_matcher = RefMatcher(nlp) | ||
| self.filter_suggestions = filter_suggestions | ||
@@ -35,9 +35,9 @@ self.default_max_count = default_max_count | ||
| try: | ||
| refd_text = None | ||
| if ref in pattern_ref: | ||
| refd_tokens = pattern_ref[ref] | ||
| if len(refd_tokens): | ||
| min_i = start + min(refd_tokens) | ||
| max_i = start + max(refd_tokens) | ||
| refd_text = doc[min_i : max_i + 1].text | ||
| refd_tokens = pattern_ref[ref] | ||
| if len(refd_tokens): | ||
| min_i = start + min(refd_tokens) | ||
| max_i = start + max(refd_tokens) | ||
| refd_text = doc[min_i : max_i + 1].text | ||
| else: | ||
| refd_text = None | ||
| except: | ||
@@ -272,5 +272,3 @@ warnings.warn( | ||
| def __call__( | ||
| self, pre_suggestion, doc, start, end, pattern, pre_suggestion_id, alignments | ||
| ): | ||
| def __call__(self, pre_suggestion, doc, start, end, pattern, pre_suggestion_id): | ||
| """ | ||
@@ -295,4 +293,3 @@ Suggestion text: | ||
| # get token <-> pattern correspondence | ||
| pattern_obj = pattern[0] | ||
| pattern_ref = self.ref_matcher(doc[start:end], pattern_obj, alignments) | ||
| pattern_ref = self.ref_matcher(doc[start:end], pattern) | ||
@@ -303,3 +300,3 @@ generated_suggestions_obj = [] | ||
| item_options = SuggestionGenerator.get_options( | ||
| item, doc, start, end, pattern_obj, pattern_ref | ||
| item, doc, start, end, pattern, pattern_ref | ||
| ) | ||
@@ -315,3 +312,3 @@ item["generated"] = item_options | ||
| generated_suggestions_obj, | ||
| pattern_obj, | ||
| pattern, | ||
| pattern_ref, | ||
@@ -318,0 +315,0 @@ doc, |
@@ -11,6 +11,5 @@ import unittest | ||
| class MatchDictTestHelper(unittest.TestCase): | ||
| @staticmethod | ||
| def generate_cases( | ||
| match_dict: Dict[str, Any] | ||
| ) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]: | ||
| def generate_cases(match_dict: Dict[str, Any]) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]: | ||
| positives: List[Tuple[str, str]] = [] | ||
@@ -33,5 +32,3 @@ negatives: List[Tuple[str, str]] = [] | ||
| cls.r_matcher = ReplaceMatcher(nlp, match_dict) | ||
| cls.positive_cases, cls.negative_cases = MatchDictTestHelper.generate_cases( | ||
| match_dict | ||
| ) | ||
| cls.positive_cases, cls.negative_cases = MatchDictTestHelper.generate_cases(match_dict) | ||
@@ -41,5 +38,3 @@ def test_positive(self): | ||
| spans = self.r_matcher(positive_sent) | ||
| spans_from_this_rule = list( | ||
| filter(lambda s: s._.match_name == match_name, spans) | ||
| ) | ||
| spans_from_this_rule = list(filter(lambda s: s._.match_name == match_name, spans)) | ||
| print(match_name, positive_sent) | ||
@@ -51,8 +46,4 @@ assert len(spans_from_this_rule) > 0, "Positive case should trigger rule" | ||
| spans = self.r_matcher(negative_sent) | ||
| spans_from_this_rule = list( | ||
| filter(lambda s: s._.match_name == match_name, spans) | ||
| ) | ||
| spans_from_this_rule = list(filter(lambda s: s._.match_name == match_name, spans)) | ||
| print(match_name, negative_sent) | ||
| assert ( | ||
| len(spans_from_this_rule) == 0 | ||
| ), "Negative case should NOT trigger rule" | ||
| assert len(spans_from_this_rule) == 0, "Negative case should NOT trigger rule" |
+0
-1
| import warnings | ||
| from typing import Any, Callable, Dict, List, Union | ||
| import spacy | ||
| from functional import seq | ||
@@ -6,0 +5,0 @@ from jsonschema import validate |
| # CHANGES HERE HAVE NO EFFECT: ../VERSION is the source of truth | ||
| __version__ = "3.1.0" | ||
| __version__ = "2.0.0" |
+1
-1
@@ -15,3 +15,3 @@ # -*- coding: utf-8 -*- | ||
| 'name': 'replacy', | ||
| 'version': '3.6.1', | ||
| 'version': '2.1.0', | ||
| 'description': 'ReplaCy = spaCy Matcher + pyInflect. Create rules, correct sentences.', | ||
@@ -18,0 +18,0 @@ 'long_description': '<p align="center">\n<img src="./docs/replacy_logo.png" align="center" />\n</p>\n\n# replaCy: match & replace with spaCy\n\nWe found that in multiple projects we had duplicate code for using spaCy’s blazing fast matcher to do the same thing: Match-Replace-Grammaticalize. So we wrote replaCy!\n\n- Match - spaCy’s matcher is great, and lets you match on text, shape, POS, dependency parse, and other features. We extended this with “match hooks”, predicates that get used in the callback function to further refine a match.\n- Replace - Not built into spaCy’s matcher syntax, but easily added. You often want to replace a matched word with some other term.\n- Grammaticalize - If you match on ”LEMMA”: “dance”, and replace with suggestions: ["sing"], but the actual match is danced, you need to conjugate “sing” appropriately. This is the “killer feature” of replaCy\n\n[](https://spacy.io)\n[](https://pypi.org/project/replacy/)\n[](https://github.com/ambv/black)\n\n<p align="center">\n<img src="./docs/replacy_ex.png" align="center" />\n</p>\n\n\n## Requirements\n\n- `spacy >= 2.0` (not installed by default, but replaCy needs to be instantiated with an `nlp` object)\n\n## Installation\n\n`pip install replacy`\n\n## Quick start\n\n```python\nfrom replacy import ReplaceMatcher\nfrom replacy.db import load_json\nimport spacy\n\n\nmatch_dict = load_json(\'/path/to/your/match/dict.json\')\n# load nlp spacy model of your choice\nnlp = spacy.load("en_core_web_sm")\n\nrmatcher = ReplaceMatcher(nlp, match_dict=match_dict)\n\n# get inflected suggestions\n# look up the first suggestion\nspan = rmatcher("She extracts revenge.")[0]\nspan._.suggestions\n# >>> [\'exacts\']\n```\n\n## Input\n\nReplaceMatcher accepts both text and spaCy doc.\n\n```python\n# text is ok\nspan = r_matcher("She extracts revenge.")[0]\n\n# doc is ok too\ndoc = nlp("She extracts revenge.")\nspan = r_matcher(doc)[0]\n```\n\n## match_dict.json format\n\nHere is a minimal 
`match_dict.json`:\n\n```json\n{\n "extract-revenge": {\n "patterns": [\n {\n "LEMMA": "extract",\n "TEMPLATE_ID": 1\n }\n ],\n "suggestions": [\n [\n {\n "TEXT": "exact",\n "FROM_TEMPLATE_ID": 1\n }\n ]\n ],\n "match_hook": [\n {\n "name": "succeeded_by_phrase",\n "args": "revenge",\n "match_if_predicate_is": true\n }\n ],\n "test": {\n "positive": [\n "And at the same time extract revenge on those he so despises?",\n "Watch as Tampa Bay extracts revenge against his former Los Angeles Rams team."\n ],\n "negative": ["Mother flavours her custards with lemon extract."]\n }\n }\n}\n```\nFor more information how to compose `match_dict` see our [wiki](https://github.com/Qordobacode/replaCy/wiki/match_dict.json-format): \n\n\n# Citing\n\nIf you use replaCy in your research, please cite with the following BibText\n\n```bibtext\n@misc{havens2019replacy,\n title = {SpaCy match and replace, maintaining conjugation},\n author = {Sam Havens, Aneta Stal, and Manhal Daaboul},\n url = {https://github.com/Qordobacode/replaCy},\n year = {2019}\n}\n', |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
112639
1.31%2745
0.44%