244 lines
6.8 KiB
ObjectPascal
244 lines
6.8 KiB
ObjectPascal
unit AG.PascalTokeniser;
|
|
|
|
interface
|
|
|
|
implementation
|
|
|
|
(*
|
|
#
|
|
# PyPascalTokenizer
|
|
# Author: Artem Gavrilov (@Artem3213212)
|
|
# License: MPL 2.0
|
|
#
|
|
|
|
import queue, threading
|
|
|
|
# Lexical tables shared by the helpers and tokenizer classes in this module.

# Characters that form a token by themselves.
# NOTE(review): '^' is not listed, so the word scanner in PasTokenizer does
# not stop on it and "p^" is read as a single token - confirm this is intended.
SYMS1 = list('()[]/|\\@#=><:;,.$+-*')

# Two-character operators; each one starts with a character from SYMS1.
SYMS2 = '>= <= <> := .. -= += /= *='.split()

# Characters treated as blanks between tokens.
SPACES = list('\f\n\r\t\v ')

# Characters that end a run of word characters.
NO_NAME_SYMS = SYMS1 + SPACES + list('{}')

# Characters accepted as the first character of a name.
CHARS_ID0 = (
    '&'
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '_'
)

# Characters accepted after the first character of a name.
CHARS_ID = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '_'
)
|
|
|
|
def is_comment(s):
    """Tell whether the token text s is a comment.

    A list (or list subclass) always counts as a comment: the tokenizer in
    this module only produces a list for a block comment that spans several
    lines.  A string counts as a comment when it opens with a brace, with a
    paren-star pair or with a double slash.

    Returns a bool.
    """
    if isinstance(s, list):
        return True
    # str.startswith accepts a tuple, so one call covers all three openers.
    return s.startswith(('{', '(*', '//'))
|
|
|
|
def is_name(s):
    """Tell whether the token text s is spelled like a Pascal identifier.

    A name is a letter or underscore followed by letters, digits and
    underscores, optionally prefixed by a single '&'.  The character right
    after the '&' must itself be a valid first character, so '&777' (an
    octal literal in Free Pascal) and a bare '&' are not names.

    Anything that is not a string (for example the list used for a
    multi-line comment) is not a name.

    Returns a bool.
    """
    if not isinstance(s, str):
        return False
    # Drop the optional '&' prefix; what is left must be a plain identifier.
    name = s[1:] if s.startswith('&') else s
    if not name:
        return False
    # CHARS_ID0 also contains '&', which is only valid as the prefix itself.
    if name[0] == '&' or name[0] not in CHARS_ID0:
        return False
    return all(ch in CHARS_ID for ch in name[1:])
|
|
|
|
def is_string(s):
    """Tell whether the token text s is a quoted string literal.

    Only strings that open with a single quote qualify.  A list (the form
    used for a multi-line comment) is never a string literal; it is answered
    with False instead of raising AttributeError.

    Returns a bool.
    """
    return isinstance(s, str) and s.startswith("'")
|
|
|
|
class PasTokenizer():
    """Cursor-based tokenizer over Pascal source held as a sequence of lines.

    The source s is indexed as s[line][column]; y and x are the current
    line and column, and ended becomes True once the cursor has run past
    the last character.

    Every token is reported as a 4-tuple (text, begin_pos, end_pos, ended).
    Positions are (y, x) pairs.  ended is the flag as it stood right after
    the token was read, before the whitespace following it was skipped.
    text is a list of lines for a block comment that spans several lines
    and a plain string otherwise.

    Known limitation: a doubled quote inside a string literal is not
    treated as an escape, so 'a''b' comes back as two adjacent string
    tokens.
    """

    def __init__(self, s):
        """Start at the first non-blank character of s (a sequence of str)."""
        self.s, self.y, self.x, self.ended = s, 0, 0, False
        self._skip_spaces()

    def _do_readable(self):
        """Put the cursor on a readable character if it is not on one.

        Returns False when the cursor already points at a character, True
        when it had to be moved or the end of input was detected.
        """
        if self._is_readable():
            return False
        if self.y + 1 >= len(self.s):
            # Nothing follows the current line (this also covers empty input).
            self.ended = True
        else:
            self.y += 1
            self.x = 0
            # Step over empty lines; an empty last line means end of input.
            while not self.s[self.y]:
                if self.y + 1 == len(self.s):
                    self.ended = True
                    break
                self.y += 1
        return True

    def _is_readable(self):
        """Return True when (y, x) addresses an existing character."""
        return self.y < len(self.s) and len(self.s[self.y]) > self.x

    def _next_readable(self):
        """Advance one column, then normalise with _do_readable()."""
        self.x += 1
        return self._do_readable()

    def _skip_spaces(self):
        """Move the cursor past blanks, stopping at the end of input."""
        self._do_readable()
        # ended must be re-checked on every step: blanks may run up to the
        # very end of the input, where there is no character left to index.
        while not self.ended and self.s[self.y][self.x] in SPACES:
            self._next_readable()

    def _get_pos(self):
        """Return the cursor as a (line, column) pair."""
        return self.y, self.x

    def _set_pos(self, i0, i1):
        """Move the cursor to line i0, column i1 and clear the ended flag."""
        self.y, self.x, self.ended = i0, i1, False
        self._do_readable()

    def _read_string(self, line):
        """Consume the quoted literal that starts at column x of line.

        The literal runs up to and including the next quote, or to the end
        of the line when the quote is never closed.  Returns its text.
        """
        start = self.x
        closing = line.find("'", start + 1)
        self.x = len(line) if closing < 0 else closing + 1
        return line[start:self.x]

    def _read_word(self, line):
        """Consume characters from column x up to the next symbol or blank.

        Returns the consumed text, which is never empty.
        """
        start = stop = self.x
        size = len(line)
        while stop < size and line[stop] not in NO_NAME_SYMS:
            stop += 1
        if stop == start:
            # The cursor sits on a character no other rule claims (for
            # example a stray closing brace).  Emit it alone so that every
            # call moves the cursor forward.
            stop += 1
        self.x = stop
        return line[start:stop]

    def get_next(self):
        """Read the token at the cursor and move to the start of the next.

        Returns (text, begin_pos, end_pos, ended) as described on the class.
        When the input is already exhausted the text is the empty string.
        """
        begin_pos = self._get_pos()
        # closer is '' outside a block comment, otherwise the character
        # that ends the comment currently being collected.
        closer, text = '', ''
        while not self.ended:
            line = self.s[self.y]
            size = len(line)
            now_sym = line[self.x]
            next_sym = line[self.x + 1] if self.x + 1 < size else ''
            if closer:
                # Inside a block comment: add one entry per line crossed
                # since the previous character, then append this character.
                while last_y != self.y:
                    text.append('')
                    last_y += 1
                text[-1] += now_sym
                if now_sym == closer:
                    if closer == '}':
                        self.x += 1
                        break
                    # A paren-star comment ends on ')' preceded by a star
                    # that is not the star of its own opener.
                    if (self.x != 0 and line[self.x - 1] == '*'
                            and (self.y, self.x - 1) != open_star):
                        self.x += 1
                        break
            elif now_sym == '/' and next_sym == '/':
                # Line comment: everything up to the end of the line.
                text = line[self.x:]
                self.x = size
                break
            elif now_sym == '{':
                closer = '}'
                text = [now_sym]
                last_y = self.y
            elif now_sym == '(' and next_sym == '*':
                closer = ')'
                self.x += 1
                last_y = self.y
                open_star = (self.y, self.x)
                text = [now_sym + next_sym]
            elif now_sym in SYMS1:
                # One-character symbol (including a plain '(' and a lone
                # '/'), widened to two characters when the pair is in SYMS2.
                text = now_sym
                self.x += 1
                if now_sym + next_sym in SYMS2:
                    self.x += 1
                    text += next_sym
                break
            elif now_sym == "'":
                text = self._read_string(line)
                break
            else:
                text = self._read_word(line)
                break
            self._next_readable()
        # A block comment that stayed on one line is returned as a string.
        if len(text) == 1:
            text = text[0]
        token = (text, begin_pos, self._get_pos(), self.ended)
        self._skip_spaces()
        return token

    def read_next(self):
        """Return the next token without consuming it."""
        i0, i1 = self._get_pos()
        token = self.get_next()
        self._set_pos(i0, i1)
        return token

    def is_ended(self):
        """Return True once the cursor has passed the last character."""
        return self.ended
|
|
|
|
class PasTokenizerStack():
    """Token source with push-back on top of a PasTokenizer.

    Tokens handed to push() are returned by pop() in last-in, first-out
    order before any new token is read from the underlying tokenizer.
    """

    def __init__(self, s, comments=True):
        """Wrap the source s.

        comments selects whether comment tokens are delivered (True) or
        silently skipped (False).
        """
        self.main = PasTokenizer(s)
        self.stack = []
        if comments:
            self._pop = self._get_with_comments
        else:
            self._pop = self._get_without_comments

    def _get_with_comments(self):
        """Read the next token of any kind from the tokenizer."""
        return self.main.get_next()

    def _get_without_comments(self):
        """Read the next token that is not a comment.

        When a comment token carries the ended flag, an empty end-of-input
        token is returned instead.
        """
        while True:
            token = self.main.get_next()
            if not is_comment(token[0]):
                return token
            if token[3]:
                return ('', (0, 0), (0, 0), True)

    def push(self, s):
        """Put the token s back so that the next pop() returns it."""
        self.stack.append(s)

    def pop(self):
        """Return the most recently pushed token, else read a new one."""
        if self.stack:
            return self.stack.pop()
        return self._pop()

    def read_last(self):
        """Return the token pop() would return, without consuming it."""
        if not self.stack:
            self.stack.append(self._pop())
        return self.stack[-1]

    def is_ended(self):
        """Return True when no token is left to deliver.

        That requires both an empty push-back stack and an exhausted
        tokenizer.  Always returns a bool.
        """
        return not self.stack and self.main.is_ended()
|
|
|
|
class PasTokenizerParallelStack(PasTokenizerStack):
    """PasTokenizerStack whose tokens are produced by a background thread.

    A worker thread reads tokens from the underlying tokenizer and hands
    them over through a bounded queue; pop() and read_last() take them from
    that queue.  After the last real token the worker queues one empty
    end-of-input token.
    """

    def __init__(self, s, comments = True, qlong = 1000):
        """Wrap the source s and start the worker.

        comments selects whether comment tokens are delivered; qlong is the
        capacity passed to queue.Queue.
        """
        super(PasTokenizerParallelStack, self).__init__(s, comments)
        self.queue = queue.Queue(qlong)
        # Marker queued by the worker after the last token; it is recognised
        # by identity so that no real token can be mistaken for it.
        self._end_token = ('', (0, 0), (0, 0), True)
        # True once the marker was taken from the queue or stop() finished.
        self._drained = False
        # Daemon thread: a worker blocked on a full queue must not keep the
        # interpreter alive when the consumer has gone away.
        self._thread = threading.Thread(target=self._work, args=(self,),
                                        daemon=True)
        self._thread.start()

    def _take(self):
        """Take the next queued token, blocking until one is available.

        Once the end marker has been seen it is returned again on every
        call instead of blocking on a queue nobody feeds any more.
        """
        if self._drained:
            return self._end_token
        token = self.queue.get()
        if token is self._end_token:
            self._drained = True
        return token

    def _get_with_comments(self):
        """Return the next token of any kind produced by the worker."""
        return self._take()

    def _get_without_comments(self):
        """Return the next token produced by the worker that is no comment.

        When a comment token carries the ended flag, an empty end-of-input
        token is returned instead.
        """
        while True:
            token = self._take()
            if not is_comment(token[0]):
                return token
            if token[3]:
                return ('', (0, 0), (0, 0), True)

    def _work(self, s):
        """Worker body: queue every token, then the end marker.

        s is unused; it is kept because the thread is started with the
        instance as its single argument.
        """
        while not self.main.is_ended():
            self.queue.put(self.main.get_next())
        self.queue.put(self._end_token)

    def is_ended(self):
        """Return True when no token is left to deliver.

        That is the case when nothing is pushed back and the end marker has
        been taken from the queue (or stop() was called).  Always returns a
        bool.
        """
        return not self.stack and self._drained

    def stop(self):
        """Stop the worker and discard the tokens it has already queued.

        Tokens that were pushed back stay available through pop().
        """
        self.main.ended = True
        # The worker may be blocked in put() on a full queue; keep removing
        # items until it has noticed the flag and finished.
        while self._thread.is_alive():
            try:
                self.queue.get(timeout=0.05)
            except queue.Empty:
                pass
        self._drained = True
|
|
*)
|
|
|
|
end.
|