%% Module text: a small DFA-driven tokenizer over lists of character
%% codes.  tokenize(+Codes, -Tokens) produces number(N), w(Word) and
%% skip(Code) tokens (whitespace is consumed but emits nothing).
:- module(text, [tokenize/2]).

%% tokenize(+Codes, -Tokens)
%% Empty input yields no tokens.
tokenize([], []).
%% Scan one token from the head of the input.  On success, run the
%% token-building Goal and continue after the consumed prefix; on
%% failure, recover by skipping the offending character.
tokenize([Char|Rest0], Tokens0) :-
    (   tokenize(Tokens0, Tokens1, [Char|Rest0], Rest1, Goal),
        look_ahead(Rest1, 0)
    ->  call(Goal),
        tokenize(Rest1, Tokens1)
    ;   error(Char, Rest0, Tokens0)
    ).

%% look_ahead(+Codes, +N)
%% Succeeds when N further tokens can be scanned from Codes.  End of
%% input always succeeds, as does N = 0 (the driver's current setting).
look_ahead([], _).
look_ahead([H|T], N) :-
    look_aheadX(N, [H|T]).

look_aheadX(0, _) :- !.
look_aheadX(N0, [H|T]) :-
    tokenize(_, _, [H|T], Remaining, _),
    N is N0 - 1,
    look_ahead(Remaining, N).

%% tokenize(Tokens0, Tokens, +String0, -String, -Goal)
%% Scan a single token starting in DFA state 0.  The recognised lexeme
%% is threaded as a difference list (Symbol .. []); Goal is the
%% token-constructing action handed back to the driver.
tokenize(Tokens0, Tokens, String0, String, Goal) :-
    tokenize(0, Tokens0, Tokens, String0, String, Symbol, [], Symbol, Goal).

%% tokenize(+State, Ts0, Ts, +S0, -S, Sy0, Sy, T, -G)
%% Transition clauses: consume code C and move to the successor state.
%% When no guard matches, the clause fails and Prolog falls through to
%% the accepting clause for the same state (if any).
%%
%% State 0 (start): whitespace -> 8, digit -> 1, letter -> 7.
tokenize(0, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C >= 9, C =< 10           % tab / newline
    ->  tokenize(8, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C =:= 32                  % space
    ->  tokenize(8, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C >= 48, C =< 57          % digit
    ->  tokenize(1, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C >= 65, C =< 90          % upper-case letter
    ->  tokenize(7, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C >= 97, C =< 122         % lower-case letter
    ->  tokenize(7, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 1 (integer part, accepting): '.' -> 2, digit -> 1, e/E -> 4.
tokenize(1, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C =:= 46                  % '.'
    ->  tokenize(2, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C >= 48, C =< 57
    ->  tokenize(1, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C =:= 69                  % 'E'
    ->  tokenize(4, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C =:= 101                 % 'e'
    ->  tokenize(4, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 2 (just after '.'): a digit is required.
tokenize(2, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C >= 48, C =< 57
    ->  tokenize(3, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 3 (fraction digits, accepting): digit -> 3, e/E -> 4.
tokenize(3, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C >= 48, C =< 57
    ->  tokenize(3, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C =:= 69
    ->  tokenize(4, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C =:= 101
    ->  tokenize(4, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 4 (just after e/E): optional sign, then digits.
tokenize(4, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C =:= 43                  % '+'
    ->  tokenize(5, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C =:= 45                  % '-'
    ->  tokenize(5, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C >= 48, C =< 57
    ->  tokenize(6, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 5 (after exponent sign): a digit is required.
tokenize(5, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C >= 48, C =< 57
    ->  tokenize(6, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 6 (exponent digits, accepting).
tokenize(6, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C >= 48, C =< 57
    ->  tokenize(6, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 7 (word, accepting): '-' and letters continue the word.
tokenize(7, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C =:= 45                  % '-' (hyphenated words)
    ->  tokenize(7, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C >= 65, C =< 90
    ->  tokenize(7, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C >= 97, C =< 122
    ->  tokenize(7, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% State 8 (whitespace run, accepting).
tokenize(8, Ts0, Ts, [C|S0], S, [C|Sy0], Sy, T, G) :-
    (   C >= 9, C =< 10
    ->  tokenize(8, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ;   C =:= 32
    ->  tokenize(8, Ts0, Ts, S0, S, Sy0, Sy, T, G)
    ).
%% Accepting clauses: reached on backtracking when no transition guard
%% matches (or at end of input).  They leave the remaining input
%% untouched (S = S), close the lexeme difference list (Sy = Sy) and
%% return the token-building goal to the driver.
tokenize(1, Ts0, Ts, S, S, Sy, Sy, T, 'Number'(T, Ts0, Ts)).
tokenize(3, Ts0, Ts, S, S, Sy, Sy, T, 'Number'(T, Ts0, Ts)).
tokenize(6, Ts0, Ts, S, S, Sy, Sy, T, 'Number'(T, Ts0, Ts)).
tokenize(7, Ts0, Ts, S, S, Sy, Sy, T, 'Word'(T, Ts0, Ts)).
tokenize(8, Ts0, Ts, S, S, Sy, Sy, T, 'WhiteSpace'(T, Ts0, Ts)).

%% Token-building actions.
%% The scanner classifies characters arithmetically (=:=, >=, =<), so a
%% lexeme is a list of character *codes*.  Conversion therefore must use
%% number_codes/2 and atom_codes/2: the ISO predicates number_chars/2
%% and atom_chars/2 expect lists of one-char atoms and raise a type
%% error when handed codes (the original code used the *_chars forms).
'Number'(Token, Tokens0, Tokens) :-
    number_codes(Number, Token),
    Tokens0 = [number(Number)|Tokens].
'Word'(Token, Tokens0, Tokens) :-
    atom_codes(Word, Token),
    Tokens0 = [w(Word)|Tokens].
'WhiteSpace'(_Token, Tokens0, Tokens) :-
    Tokens0 = Tokens.             % whitespace emits no token

%% error(+Code, +RestCodes, -Tokens)
%% Error recovery: emit skip(Code) for the unrecognised character and
%% resume tokenizing on the remaining input.
error(Char, Chars, Tokens) :-
    Tokens = [skip(Char)|TokensRest],
    tokenize(Chars, TokensRest).