11# -*- coding: utf-8 -*-
2+ from __future__ import print_function
23import re
34import os
45import sys
6+
# Python 3 has no ``xrange`` builtin; probing for the name and falling back
# to ``range`` lets the rest of the module call ``xrange`` on either version.
try:
    xrange  # Python 2: the builtin exists, nothing to do
except NameError:
    xrange = range  # Python 3
11+
512class SEG (object ):
def __init__(self):
    """Load the dictionary files located next to this module.

    Reads ``main.dic`` and passes its right-stripped lines to ``self.set``,
    then reads ``suffix.dic`` and stores its right-stripped, UTF-8-decoded
    lines in ``self.specialwords`` as a set.  Progress messages are written
    to ``sys.stderr``.
    """
    # Join the module directory onto the current working directory so a
    # relative ``__file__`` still resolves to a usable path.
    local_dir = os.path.dirname(__file__)
    curpath = os.path.normpath(os.path.join(os.getcwd(), local_dir))

    self.d = {}
    print("loading dict...", file=sys.stderr)

    # ``file()`` is gone in Python 3; ``open`` in binary mode yields byte
    # lines on both major versions, and ``with`` closes each handle.
    # NOTE(review): the entries handed to ``set`` stay undecoded, exactly as
    # before -- confirm ``set`` accepts byte strings on Python 3.
    with open(os.path.join(curpath, "main.dic"), "rb") as main_dic:
        self.set([line.rstrip() for line in main_dic])

    with open(os.path.join(curpath, "suffix.dic"), "rb") as suffix_dic:
        self.specialwords = set(
            line.rstrip().decode('utf-8') for line in suffix_dic
        )

    print('dict ok.', file=sys.stderr)
1522 #set dictionary(a list)
1623 def set (self ,keywords ):
1724 p = self .d
@@ -33,8 +40,6 @@ def set(self,keywords):
3340 q = p
3441 k = char
3542 p = p [char ]
36-
37- pass
3843
3944 def _binary_seg (self ,s ):
4045 ln = len (s )
@@ -47,7 +52,7 @@ def _binary_seg(self,s):
4752 return R
4853
4954 def _pro_unreg (self ,piece ):
50- #print piece
55+ #print( piece)
5156 R = []
5257 tmp = re .sub (u"。|,|,|!|…|!|《|》|<|>|\" |'|:|:|?|\?|、|\||“|”|‘|’|;|—|(|)|·|\(|\)| " ," " ,piece ).split ()
5358 ln1 = len (tmp )
@@ -77,7 +82,7 @@ def cut(self,text):
7782 mem2 = None
7883 while i - j > 0 :
7984 t = text [i - j - 1 ].lower ()
80- #print i,j,t,mem
85+ #print( i,j,t,mem)
8186 if not (t in p ):
8287 if (mem != None ) or (mem2 != None ):
8388 if mem != None :
@@ -88,7 +93,7 @@ def cut(self,text):
8893 if delta >= 1 :
8994 if (delta < 5 ) and (re .search (u"[\w\u2E80 -\u9FFF ]" ,t )!= None ):
9095 pre = text [i - j ]
91- #print pre
96+ #print( pre)
9297 if not (pre in self .specialwords ):
9398 i ,j ,z ,q = mem2
9499 del recognised [q :]
@@ -99,7 +104,7 @@ def cut(self,text):
99104 unreg_tmp = self ._pro_unreg (text [i :z ])
100105 recognised .extend (unreg_tmp )
101106 recognised .append (text [i - j :i ])
102- #print text[i-j:i],mem2
107+ #print( text[i-j:i],mem2)
103108 i = i - j
104109 z = i
105110 j = 0
@@ -113,18 +118,18 @@ def cut(self,text):
113118 if chr (11 ) in p :
114119 if j <= 2 :
115120 mem = i ,j ,z
116- #print text[i-1]
121+ #print( text[i-1])
117122 if (z - i < 2 ) and (text [i - 1 ] in self .specialwords ) and ((mem2 == None ) or ((mem2 != None and mem2 [0 ]- i > 1 ))):
118- #print text[i-1]
123+ #print( text[i-1])
119124 mem = None
120125 mem2 = i ,j ,z ,len (recognised )
121126 p = self .d
122127 i -= 1
123128 j = 0
124129 continue
125- #print mem
130+ #print( mem)
126131 p = self .d
127- #print i,j,z,text[i:z]
132+ #print( i,j,z,text[i:z])
128133 if ((i < ln ) and (i < z )):
129134 unreg_tmp = self ._pro_unreg (text [i :z ])
130135 recognised .extend (unreg_tmp )
@@ -134,7 +139,7 @@ def cut(self,text):
134139 j = 0
135140 mem = None
136141 mem2 = None
137- #print mem
142+ #print( mem)
138143 if mem != None :
139144 i ,j ,z = mem
140145 recognised .extend (self ._pro_unreg (text [i :z ]))
0 commit comments