From 20f77e17f5eec134acd0e4c916ba8442251fb162 Mon Sep 17 00:00:00 2001 From: pgambhir88 <60377649+pgambhir88@users.noreply.github.com> Date: Sun, 4 Sep 2022 14:19:19 -0400 Subject: [PATCH] Update example.py --- example.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/example.py b/example.py index 41f292b..53e095f 100755 --- a/example.py +++ b/example.py @@ -1,11 +1,20 @@ import metapy def tokens_lowercase(doc): - #Write a token stream that tokenizes with ICUTokenizer (use the argument "suppress_tags=True"), + #Write a token stream that tokenizes with ICUTokenizer, #lowercases, removes words with less than 2 and more than 5 characters #performs stemming and creates trigrams (name the final call to ana.analyze as "trigrams") '''Place your code here''' + metapy.log_to_stderr() + tok = metapy.analyzers.ICUTokenizer(suppress_tags=True) + tok = metapy.analyzers.LowercaseFilter(tok) + tok = metapy.analyzers.LengthFilter(tok, min=2, max=5) + tok = metapy.analyzers.Porter2Filter(tok) + ana = metapy.analyzers.NGramWordAnalyzer(3, tok) + trigrams = ana.analyze(doc) + return trigrams + #leave the rest of the code as is tok.set_content(doc.content()) tokens, counts = [], []