Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
class Token:
    """A single lexical token.

    The constructor stores its arguments unchanged:

    pos   -- position of the token; TData.add passes a (line, column) pair
    type  -- token category string ('symbol' by default)
    val   -- token payload (None by default)
    items -- optional extra payload (None by default)
    """

    def __init__(self, pos=(0, 0), type='symbol', val=None, items=None):
        # `type` shadows the builtin, but it is part of the keyword interface
        # and therefore kept as is.
        self.pos = pos
        self.type = type
        self.val = val
        self.items = items
||
4 | |||
def u_error(ctx, s, i):
    """Raise an error that quotes the offending source line.

    ctx -- short label naming the failing stage (e.g. 'tokenize')
    s   -- full source text with newline-separated lines
    i   -- (line, column) pair; do_tokenize produces both as 1-based values

    Always raises Exception; the message is 'error: <ctx>' followed by the
    numbered source line and a caret line pointing at the column.
    """
    y, x = i
    line = s.split('\n')[y - 1]

    # Pad the line number so one- and two-digit numbers line up.
    gutter = ''
    if y < 10:
        gutter += ' '
    if y < 100:
        gutter += ' '
    gutter += str(y) + ": "

    # The caret has to skip the gutter as well as the columns before x,
    # otherwise it does not sit under the reported character.
    caret = ' ' * (len(gutter) + x - 1) + '^'

    report = gutter + line + '\n' + caret + '\n'
    # Raising a plain string is rejected by the interpreter (TypeError), which
    # would hide the report; wrap it in a real exception instead.
    raise Exception('error: ' + ctx + '\n' + report)
||
14 | |||
# Characters that can start or continue an operator/punctuation token.
# '|' is included because SYMBOLS lists it as an operator; without it the
# character would never reach do_symbol.
ISYMBOLS = '`-=[];,./~!@$%^&*()+{}:<>?|'

# Every reserved word and operator that is emitted as a 'symbol' token.
SYMBOLS = [
    'def', 'class', 'yield', 'return', 'pass', 'and', 'or', 'not', 'in', 'import',
    'is', 'while', 'break', 'for', 'continue', 'if', 'else', 'elif', 'try',
    'except', 'raise', 'True', 'False', 'None', 'global', 'del', 'from',
    '-', '+', '*', '**', '/', '%', '<<', '>>',
    '-=', '+=', '*=', '/=', '=', '==', '!=', '<', '>',
    '<=', '>=', '[', ']', '{', '}', '(', ')', '.', ':', ',', ';', '&', '|', '!',
]

# Opening and closing bracket symbols; do_symbol uses them to track nesting.
B_BEGIN, B_END = ['[', '(', '{'], [']', ')', '}']
||
25 | |||
class TData:
    """Mutable state of one tokenizer run."""

    def __init__(self):
        # y:  current line number, starting at 1
        # yi: index into the source where the current line begins
        # nl: True while the start of a line still needs its indentation measured
        self.y, self.yi, self.nl = 1, 0, True
        # f: (line, column) stamped onto tokens by add(); initialised here so
        # add() works even before the tokenizer loop has updated it.
        self.f = (1, 1)
        # res: tokens produced so far; indent: stack of open indentation widths;
        # braces: current bracket nesting depth
        self.res, self.indent, self.braces = [], [0], 0

    def add(self, t, v):
        """Append a Token of type `t` with value `v` at the current position."""
        self.res.append(Token(self.f, t, v))
||
31 | |||
def clean(s):
    """Return `s` with CRLF and lone CR line endings converted to LF."""
    # CRLF must be handled first so it does not turn into two newlines.
    return s.replace('\r\n', '\n').replace('\r', '\n')
||
36 | |||
def tokenize(s):
    """Tokenize source text `s` and return the list of tokens.

    Line endings are normalised first. Any failure inside the tokenizer is
    reported through u_error at the last recorded position.
    """
    s = clean(s)
    try:
        return do_tokenize(s)
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not rewritten into a tokenize error.
        u_error('tokenize', s, T.f)
||
41 | |||
def do_tokenize(s):
    """Run the tokenizer over `s` and return the resulting token list.

    Installs a fresh TData as the module-level `T` for the duration of the
    run and resets `T` to None once the run has finished successfully.
    """
    global T
    T, i, l = TData(), 0, len(s)
    T.f = (T.y, i - T.yi + 1)
    while i < l:
        c = s[i]
        # Record the (line, column) of the character about to be handled.
        T.f = (T.y, i - T.yi + 1)
        if T.nl:
            # First character of a line: measure indentation before anything else.
            T.nl = False
            i = do_indent(s, i, l)
        elif c == '\n':
            i = do_nl(s, i, l)
        elif c in ISYMBOLS:
            i = do_symbol(s, i, l)
        elif '0' <= c <= '9':
            i = do_number(s, i, l)
        elif ('a' <= c <= 'z') or ('A' <= c <= 'Z') or c == '_':
            i = do_name(s, i, l)
        elif c == '"' or c == "'":
            i = do_string(s, i, l)
        elif c == '#':
            i = do_comment(s, i, l)
        elif c == '\\' and i + 1 < l and s[i + 1] == '\n':
            # Backslash-newline joins lines: skip both characters and start
            # counting a new line without emitting a token. The bounds check
            # keeps a trailing backslash from indexing past the end.
            i += 2
            T.y, T.yi = T.y + 1, i
        elif c == ' ' or c == '\t':
            i += 1
        else:
            u_error('tokenize', s, T.f)
    # Close every indentation level still open at end of input.
    indent(0)
    r = T.res
    T = None
    return r
||
63 | |||
def do_nl(s, i, l):
    """Handle the newline at s[i] and return the index after it.

    An 'nl' token is emitted only outside brackets. The line counter and
    line-start index in `T` are always advanced.
    """
    if not T.braces:
        T.add('nl', None)
    # Advance unconditionally so the caller makes progress inside brackets too.
    i, T.nl = i + 1, True
    T.y, T.yi = T.y + 1, i
    return i
||
70 | |||
def do_indent(s, i, l):
    """Consume leading spaces/tabs starting at s[i]; return the next index.

    Each space or tab counts as one unit of width. The measured width is
    passed to indent() unless the line is blank, holds only a comment, sits
    inside brackets, or the input ends before any other character.
    """
    v = 0
    while i < l:
        c = s[i]
        if c != ' ' and c != '\t':
            break
        i, v = i + 1, v + 1
    # `i < l` guards the end-of-input case: a final line made only of
    # whitespace must not be mistaken for an indentation change.
    if i < l and c != '\n' and c != '#' and not T.braces:
        indent(v)
    return i
||
79 | |||
def indent(v):
    """Emit 'indent'/'dedent' tokens for a line whose indentation width is `v`.

    A wider line pushes `v` on T.indent and emits one 'indent'. A narrower
    line pops levels until `v` is on top, emitting one 'dedent' per popped
    level. list.index raises ValueError when `v` is not an open level.
    """
    top = T.indent[-1]
    if v == top:
        return
    if v > top:
        T.indent.append(v)
        T.add('indent', v)
        return
    # v < top: unwind back to the level that opened with width v.
    keep = T.indent.index(v)
    while len(T.indent) > keep + 1:
        closed = T.indent.pop()
        T.add('dedent', closed)
||
90 | |||
91 | |||
def do_symbol(s, i, l):
    """Tokenize the operator/punctuation starting at s[i]; return the next index.

    Scans the whole run of ISYMBOLS characters and keeps the longest prefix
    that is listed in SYMBOLS. Bracket symbols adjust T.braces.
    """
    symbols = []
    v, f, i = s[i], i, i + 1
    if v in SYMBOLS:
        symbols.append(v)
    while i < l:
        c = s[i]
        if c not in ISYMBOLS:
            break
        v, i = v + c, i + 1
        if v in SYMBOLS:
            symbols.append(v)
    # Prefixes were appended in growing order, so the last one is the longest.
    # pop() raises IndexError when no prefix is a known symbol.
    v = symbols.pop()
    # Rewind to just past the chosen symbol; the rest of the run is rescanned.
    i = f + len(v)
    T.add('symbol', v)
    if v in B_BEGIN:
        T.braces += 1
    if v in B_END:
        T.braces -= 1
    return i
||
106 | |||
def do_number(s, i, l):
    """Tokenize the number starting at s[i]; return the index after it.

    The integer part accepts the characters 0-9, a-f and x. An optional
    fractional part is a '.' followed by decimal digits. The text is emitted
    unconverted as a 'number' token.
    """
    v, i, c = s[i], i + 1, s[i]
    while i < l:
        c = s[i]
        if not ('0' <= c <= '9' or 'a' <= c <= 'f' or c == 'x'):
            break
        v, i = v + c, i + 1
    # `c` is the character that stopped the scan (or the last one consumed
    # when the input ended), so a '.' here begins the fractional part.
    if c == '.':
        v, i = v + c, i + 1
        while i < l:
            c = s[i]
            if c < '0' or c > '9':
                break
            v, i = v + c, i + 1
    T.add('number', v)
    return i
||
121 | |||
def do_name(s, i, l):
    """Tokenize the identifier or keyword starting at s[i]; return the next index.

    Consumes ASCII letters, digits and underscores. Words listed in SYMBOLS
    become 'symbol' tokens; everything else becomes a 'name' token.
    """
    v, i = s[i], i + 1
    while i < l:
        c = s[i]
        is_word_char = ('a' <= c <= 'z' or 'A' <= c <= 'Z'
                        or '0' <= c <= '9' or c == '_')
        if not is_word_char:
            break
        v, i = v + c, i + 1
    if v in SYMBOLS:
        T.add('symbol', v)
    else:
        T.add('name', v)
    return i
||
131 | |||
def do_string(s, i, l):
    """Tokenize the string literal whose opening quote is s[i]; return the next index.

    Triple-quoted strings are copied verbatim and may span lines. Single-
    quoted strings translate the escapes \\n, \\r, \\t and \\0; any other
    escaped character stands for itself.

    NOTE(review): if the closing quote is never found, the loop simply runs
    out and no 'string' token is emitted -- confirm that is intended.
    """
    v, q, i = '', s[i], i + 1
    if (l - i) >= 5 and s[i] == q and s[i + 1] == q:  # """
        i += 2
        # Stop two short of the end: the closing test reads s[i+1] and s[i+2].
        while i < l - 2:
            c = s[i]
            if c == q and s[i + 1] == q and s[i + 2] == q:
                i += 3
                T.add('string', v)
                break
            else:
                v, i = v + c, i + 1
                # Keep the line bookkeeping in step with embedded newlines.
                if c == '\n':
                    T.y, T.yi = T.y + 1, i
    else:
        escapes = {'n': '\n', 'r': chr(13), 't': '\t', '0': '\0'}
        while i < l:
            c = s[i]
            if c == "\\":
                # Take the character after the backslash, translated if known.
                i = i + 1
                c = s[i]
                c = escapes.get(c, c)
                v, i = v + c, i + 1
            elif c == q:
                i += 1
                T.add('string', v)
                break
            else:
                v, i = v + c, i + 1
    return i
||
162 | |||
def do_comment(s, i, l):
    """Skip the comment starting at s[i]; return the index of the ending newline.

    The newline itself is not consumed. When the comment runs to the end of
    the input, `l` is returned. No token is emitted.
    """
    i += 1
    while i < l:
        if s[i] == '\n':
            break
        i += 1
    return i
||
170 | |||
171 |