
Commit ce2661c

Fix issue #2: It is now possible to define new handler for each language

committed · 1 parent bd1dbdd · commit ce2661c

8 files changed (+73, -34 lines)

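
In short, the commit promotes the generic LeafVisitor into code_tokenize/lang/base_visitors.py and lets every language module register its own handler through the visitors list of its TokenizationConfig, instead of hard-coding language-specific cases in tokenizer.py. Below is a rough sketch of the resulting extension pattern for a new language; the MyLangLeafVisitor class, the "mylang" identifier and the visit_string_literal node type are illustrative, not part of this commit.

# Illustrative sketch only: mirrors the new Go and Python language modules below.
from code_tokenize.config import TokenizationConfig
from code_tokenize.lang.base_visitors import LeafVisitor


class MyLangLeafVisitor(LeafVisitor):

    # Emit string literals as single tokens instead of descending into them
    def visit_string_literal(self, node):
        self.node_handler(node)
        return False


def create_tokenization_config():
    return TokenizationConfig(
        lang = "mylang",
        statement_types = ["*_statement"],
        visitors = [MyLangLeafVisitor],  # language-specific handler classes
        indent_tokens = False
    )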

code_tokenize/config.py

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,8 @@
 
 import json
 
+from .lang.base_visitors import LeafVisitor
+
 
 class TokenizationConfig:
     """Helper object to translate arguments of tokenize to config object"""
@@ -17,7 +19,7 @@ def __init__(self, lang, **kwargs):
             "*_statement", "*_definition", "*_declaration"
         ]
 
-        self.visitors = [] # Additional visitor which should be run during analysis
+        self.visitors = [LeafVisitor] # visitor classes which should be run during analysis
 
         self.update(kwargs)
 

code_tokenize/lang/__init__.py

Lines changed: 2 additions & 10 deletions
@@ -1,5 +1,5 @@
 
-from ..config import TokenizationConfig
+from ..config import TokenizationConfig
 
 from .python import create_tokenization_config as pytok_config
 from .java import create_tokenization_config as jvtok_config
@@ -8,8 +8,6 @@
 from .php import create_tokenization_config as phptok_config
 from .ruby import create_tokenization_config as rubytok_config
 
-from .python.indent import IndentVisitor as PythonIndentVisitor
-
 
 def load_from_lang_config(lang, **kwargs):
 
@@ -19,13 +17,7 @@ def load_from_lang_config(lang, **kwargs):
     elif lang == "javascript" : base_config = jstok_config()
     elif lang == "php" : base_config = phptok_config()
     elif lang == "ruby" : base_config = rubytok_config()
-    else : base_config = TokenizationConfig(lang)
+    else : base_config = TokenizationConfig(lang)
 
     base_config.update(kwargs)
     return base_config
-
-
-def indent_handler_clazz(lang):
-    if lang == "python": return PythonIndentVisitor
-
-    # TODO: Ruby indent visitor

code_tokenize/lang/base_visitors.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+from code_ast import ASTVisitor
+
+# Basic visitor -----------------------------------------------------------
+
+class LeafVisitor(ASTVisitor):
+
+    def __init__(self, node_handler):
+        self.node_handler = node_handler
+
+    def visit_string(self, node):
+        self.node_handler(node)
+        return False
+
+    def visit(self, node):
+        if node.child_count == 0:
+            self.node_handler(node)
+        return False

code_tokenize/lang/go/__init__.py

Lines changed: 20 additions & 1 deletion
@@ -1,11 +1,30 @@
 
 from ...config import TokenizationConfig
+from ...tokens import NewlineToken
+
+from ..base_visitors import LeafVisitor
+
 
 # Tokenization config ----------------------------------------------------------------
 
 def create_tokenization_config():
     return TokenizationConfig(
         lang = 'go',
         statement_types = ["*_statement", "*_declaration"],
+        visitors = [GoLeafVisitor],
         indent_tokens = False
-    )
+    )
+
+# Custom leaf visitor ----------------------------------------------------------------
+
+class GoLeafVisitor(LeafVisitor):
+
+    def visit_interpreted_string_literal(self, node):
+        self.node_handler(node)
+        return False
+
+    def visit(self, node):
+        if node.type == "\n":
+            self.node_handler.handle_token(NewlineToken(self.node_handler.config))
+            return False
+        return super().visit(node)
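
The new GoTokenizationTest at the bottom of this commit exercises both behaviours of this visitor: interpreted string literals stay single tokens and statement-terminating newlines become #NEWLINE# tokens. Roughly, assuming the tests' ctok alias is the code_tokenize package:

# Expected output taken from tests/test_tokenization.py below.
import code_tokenize as ctok

tokens = ctok.tokenize('func main(){\n tip1 := "test"\n}', lang = "go")
print([str(t) for t in tokens])
# ['func', 'main', '(', ')', '{', 'tip1', ':=', '"test"', '#NEWLINE#', '}']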

code_tokenize/lang/python/__init__.py

Lines changed: 13 additions & 0 deletions
@@ -1,12 +1,25 @@
 
 from ...config import TokenizationConfig
 
+from ..base_visitors import LeafVisitor
+from .indent import IndentVisitor
+
 
 # Tokenization config ----------------------------------------------------------------
 
 def create_tokenization_config():
     return TokenizationConfig(
         lang = "python",
         statement_types = ["*_statement", "*_definition"],
+        visitors = [PythonLeafVisitor, IndentVisitor],
         indent_tokens = True
     )
+
+# Custom leaf visitor ----------------------------------------------------------------
+
+class PythonLeafVisitor(LeafVisitor):
+
+    def visit_unary_operator(self, node):
+        if node.children[-1].type == "integer":
+            self.node_handler(node)
+            return False
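
PythonLeafVisitor.visit_unary_operator takes over behaviour that was previously hard-coded in tokenizer.py (see the deletion below): a unary minus followed by an integer literal is emitted as one token, so negative numbers are not split into "-" and the digits. A hedged example of the intended effect; the exact surrounding token sequence (e.g. newline or indentation tokens) is not asserted here:

import code_tokenize as ctok

tokens = ctok.tokenize("x = -1", lang = "python")
# "-1" should appear as a single token rather than "-", "1"
assert "-1" in [str(t) for t in tokens]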

code_tokenize/tokenizer.py

Lines changed: 7 additions & 21 deletions
@@ -3,8 +3,6 @@
 from code_ast.visitor import ASTVisitor, ResumingVisitorComposition
 
 from .tokens import ASTToken, TokenSequence
-from .lang import indent_handler_clazz
-
 
 
 # Interface ----------------------------------------------------------------
@@ -66,12 +64,14 @@ def _create_token_handler(self, code_lines):
 
     def _create_tree_visitors(self, token_handler, visitors = None):
         visitors = visitors or []
+        visitors += self._visitor_factories
 
-        visitors += [visitor_fn(token_handler)
-                        for visitor_fn in self._visitor_factories]
+        visitors = [visitor_fn(token_handler)
+                        if callable(visitor_fn)
+                        else visitor_fn
+                        for visitor_fn in visitors]
 
         return ResumingVisitorComposition(
-            LeafVisitor(self.config, token_handler),
             ErrorVisitor(self.config),
             *visitors
         )
@@ -88,35 +88,21 @@ def __call__(self, code_tree, code_lines, visitors = None):
 
 def create_tokenizer(config):
     """Function to create tokenizer based on configuration"""
-    tokenizer = Tokenizer(config)
-
-    if config.indent_tokens:
-        indent_handler = indent_handler_clazz(config.lang)
-        assert indent_handler is not None, "Language %s does not support indentation handling" % config.lang
-        tokenizer.append_visitor(indent_handler)
-
-    return tokenizer
+    return Tokenizer(config)
 
 
 # Basic visitor -----------------------------------------------------------
 
 
 class LeafVisitor(ASTVisitor):
 
-    def __init__(self, config, node_handler):
-        self.config = config
+    def __init__(self, node_handler):
         self.node_handler = node_handler
 
     def visit_string(self, node):
         self.node_handler(node)
         return False
 
-    def visit_unary_operator(self, node):
-        # TODO Use a custom leaf visitor
-        if node.children[-1].type == "integer":
-            self.node_handler(node)
-            return False
-
     def visit(self, node):
         if node.child_count == 0:
             self.node_handler(node)
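
After this change _create_tree_visitors accepts a mixed visitors list: callable entries (the visitor classes coming from the config) are called with the token handler, while entries that are not callable, such as already-constructed visitor instances, are used as-is. The generic LeafVisitor is no longer injected here because it now arrives through the per-language config. A small standalone sketch of that resolution step (names are illustrative):

# Sketch of the new list comprehension: callable entries are treated as
# factories, everything else is assumed to be a ready-made visitor instance.
def resolve_visitors(entries, token_handler):
    return [entry(token_handler) if callable(entry) else entry
            for entry in entries]

# resolve_visitors([GoLeafVisitor, prebuilt_indent_visitor], handler)
# -> [GoLeafVisitor(handler), prebuilt_indent_visitor]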

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 tree_sitter==0.19.0
 requests==2.25.1
-GitPython==3.1.18
+GitPython==3.1.18
+code_ast==0.1.0

tests/test_tokenization.py

Lines changed: 9 additions & 0 deletions
@@ -42,4 +42,13 @@ def test_error_handling(self):
     def test_error_handling2(self):
         tokens = ctok.tokenize("public int myFunc(int x){\n x = x + 1;\n return x;\n}", lang = "java", syntax_error = "ignore")
         expected = ["public", "int", "myFunc", "", "(", "int", "x", ")", "{", "x", "=", "x", "+", "1", ";", "return", "x", ";", "}"]
+        self.assertEqual(expected, [str(t) for t in tokens])
+
+
+class GoTokenizationTest(TestCase):
+
+    def test_tokenize1(self):
+        tokens = ctok.tokenize('func main(){\n tip1 := "test"\n}', lang = "go")
+        expected = ["func", "main", "(", ")", "{", "tip1", ":=", '"test"', "#NEWLINE#", "}"]
+
         self.assertEqual(expected, [str(t) for t in tokens])
