Python实现的C语言词法分析

时间:2022-05-13 16:54:50

 编译原理课上的一个实验是做一个编译器前端的词法分析器,我选择了用Python来写C语言的词法分析。

 

         词法分析器的功能是输入源程序,输出单词符号。当初定义Token(单词种别,属性值)序列的时候,是将单词种别用数字来表示,后来在做语法分析的时候,发现用数字是不太合理的,所以又对单词的种别码进行了一番修改。

         我的程序的总体思路是先对源程序进行一遍扫描,将多余的空格和注释去除,然后再读一遍已经进行过预处理的源程序,进行单词的识别,转换成二元组,保存到token文件中,并建立符号表对标识符进行管理,如果发现了错误,对其的位置和错误信息进行打印。

         在对单词的识别部分,我采用了有穷自动机的理论来进行识别。这样就可以根据现在的状态和输入符号决定其后继行为。因此在对单词的识别中,我画了很多的状态图来识别不同的单词,如字符串、数字等等。状态图的绘制中,本来想用visio来画的,后来的后来觉得太麻烦了,还是用了最快的手画的方法。

Python实现的C语言词法分析

图1.注释的状态转换图

Python实现的C语言词法分析

图2.标识符的状态转换图

Python实现的C语言词法分析

图3.字符串的状态转换图

 Python实现的C语言词法分析

图4.界符的状态转换图

 

Python实现的C语言词法分析

图5.整常数、浮点常数的状态转换图

 Python实现的C语言词法分析

图6.字符常数的状态转换图

         关于错误处理方面,对于词法分析阶段所能遇到的几种错误(如下图所示的四种),我对其中的前三种都进行了相应的处理。但是第三点做得不太好,对字符常数中可以出现的字符限制得有点过于厉害,例如分号等在我的词法分析器中是不能在字符串中出现的。

Python实现的C语言词法分析

图7.词法分析中的四种错误

测试程序如下,内包含主要的C语言的各种语句,含有少量的错误:

int main(){    int _a;         char ch = 'f;    floatb,centigrade,fahrj@enheit;         char fd = '\n';     printf("please inputa);   scanf("%d",&a); /*mycomment1or***2*/    printf("please inputb");   scanf("%f",&b);    if (a==8.1.6)    {       centigrade=095*(b-32)/9; /*itismyc5435omment*/        printf("TheCentigrade is ",centigrade);  /*mess/age*/    }    else if (a!=0)    {       fahrenheit=(9/5.0)*b++32; /*mycontent*/        printf("TheFahrenheit is fahrenheit);   /*hello****/    }    return 0;}


 

运行结果如下图所示:Python实现的C语言词法分析

图8.测试程序的错误报告

这是用Python写的第一个稍微像点样的东西,所以很多地方写的不大好,代码结构也是有点混乱。总而言之,就是在这样的条件下把编译原理的第一次实验给写完了。接下来是我的水水的代码了。

# -*- coding: utf-8 -*- 
'''
Created on 2012-10-18

@author: zouliping
'''
import string

# The 32 keywords of the C language.
_key = (
    "auto", "break", "case", "char",
    "const", "continue", "default", "do",
    "double", "else", "enum", "extern",
    "float", "for", "goto", "if",
    "int", "long", "register", "return",
    "short", "signed", "static", "sizeof",
    "struct", "switch", "typedef", "union",
    "unsigned", "void", "volatile", "while",
)

# Illegal characters that may turn up inside an identifier.
_abnormalChar = '@#$%^&*~'

# Shared scanner state, read and written by the functions below.
_syn = ''       # kind code of the current token
_p = 0          # index into the program text
_value = ''     # lexeme produced by the lexical analysis
_content = ''   # program text
_mstate = 0     # automaton state for string literals
_cstate = 0     # automaton state for character literals
_dstate = 0     # automaton state for integers and floating-point numbers
_line = 1       # current line number of the analysed code
_mysymbol = []  # symbol table

def outOfComment():
    """Strip C block comments (``/* ... */``) from the module-level ``_content``.

    The text is scanned once with a four-state automaton:

    * state 0 - ordinary text
    * state 1 - a ``/`` was just read and may open a comment
    * state 2 - inside a comment
    * state 3 - inside a comment, directly after a ``*``

    Only the spans recognised as comments are dropped (including any line
    breaks inside them); everything else is copied through unchanged.  A
    comment that is never closed is left in place.  The scan knows nothing
    about string literals, so ``/* ... */`` inside a quoted string is removed
    as well.
    """
    global _content

    source = _content
    kept = []        # pieces of ``source`` that survive
    state = 0
    start = 0        # index of the '/' that opened the current candidate
    resume = 0       # index of the first character not yet copied to ``kept``

    for index, c in enumerate(source):
        if state == 0:
            if c == '/':
                state = 1
                start = index

        elif state == 1:
            if c == '*':
                state = 2
            elif c == '/':
                # "//*": the newer slash is the one that may open a comment.
                start = index
            else:
                state = 0

        elif state == 2:
            if c == '*':
                state = 3

        else:  # state == 3
            if c == '/':
                # Cut exactly this span out.  Searching for the comment text
                # and replacing it would also delete identical comments
                # elsewhere and corrupt the positions used afterwards.
                kept.append(source[resume:start])
                resume = index + 1
                state = 0
            elif c != '*':
                state = 2

    kept.append(source[resume:])
    _content = ''.join(kept)

def getMyProm(path=r'E://test.txt'):
    """Read the program to analyse and append it to the module-level ``_content``.

    Leading whitespace is removed from every line.  A line that consists of
    whitespace only is reduced to its line break rather than to nothing, so
    that the number of line breaks in ``_content`` still matches the file.

    :param path: file to read; defaults to the location the script has
        always used.
    :raises IOError: (``OSError`` on Python 3) if the file cannot be opened.
    """
    global _content

    pieces = [_content]
    # ``with`` guarantees the file is closed even if reading fails part-way.
    with open(path, 'r') as myPro:
        for line in myPro:
            stripped = line.lstrip()
            if not stripped and line.endswith('\n'):
                # lstrip() also ate the newline of a blank line; put it back.
                stripped = '\n'
            pieces.append(stripped)
    # Join once instead of rebuilding the whole string for every line.
    _content = ''.join(pieces)

# Character classes used by the scanner.  They are sets rather than strings
# so that a membership test is always about one whole character.
_LETTERS = frozenset(string.ascii_letters)
_DIGITS = frozenset(string.digits)
_BLANKS = frozenset(' \t\r\f\v')
_IDENT_CHARS = _LETTERS | _DIGITS | frozenset('_')
_NUMBER_CHARS = _LETTERS | _DIGITS | frozenset('.')
_STRING_CHARS = _LETTERS | frozenset('"% ')
_CHAR_BODY = _LETTERS | frozenset('@#$%&*')
_CHAR_CHARS = _CHAR_BODY | frozenset('\\\'"')

# First character of a one-or-two character operator -> the second character
# that turns it into the two-character form.
_PAIR_OPERATORS = {
    '<': '=', '>': '=', '!': '=', '=': '=',
    '+': '+', '-': '-', '&': '&', '|': '|',
}
_SINGLE_DELIMITERS = frozenset('*/;(){}[],')


def _scanWhile(text, pos, allowed):
    '''Return the index just past the run of ``allowed`` characters at ``pos``.'''
    size = len(text)
    while pos < size and text[pos] in allowed:
        pos += 1
    return pos


def _classifyNumber(text):
    '''Return the kind code for a run of digits, letters and dots.'''
    if any(c in _LETTERS for c in text):
        return '@-7'        # digits mixed with letters, e.g. 12AB56
    if len(text) > 1 and text[0] == '0' and text[1] != '.':
        return '@-3'        # leading zero, e.g. 095
    dots = text.count('.')
    if dots == 0:
        return 'DIGIT'
    if dots == 1:
        return 'FRACTION'
    return '@-5'            # more than one dot, e.g. 1.2.3


def analysis(mystr):
    '''Scan one token of ``mystr`` starting at the module-level index ``_p``.

    The result is left in module-level variables: ``_syn`` receives the kind
    code, ``_value`` the lexeme, and ``_p`` is advanced past the token.
    Identifiers that are not keywords are passed to ``inSymbolTable``.

    Kind codes:

    * ``'ID'``, ``'STRING'``, ``'DIGIT'``, ``'FRACTION'``, ``'CHARACTER'``
    * the upper-cased keyword for the entries of ``_key``
    * the operator or delimiter text itself (``'<='``, ``'++'``, ``';'`` ...)
    * ``'@-1'`` no token: a line break was consumed, or only blanks were left
      before the end of the input (``_value`` is empty)
    * ``'@-2'`` string literal not closed
    * ``'@-3'`` number with a leading zero
    * ``'@-4'`` character literal not closed
    * ``'@-5'`` number with more than one dot
    * ``'@-6'`` identifier containing a character of ``_abnormalChar``
    * ``'@-7'`` number containing letters
    * ``'@-8'`` character that starts no token known to this scanner

    String literals accept only letters, ``%``, blanks and the quote itself;
    character literals accept only letters and ``@#$%&*`` plus backslash and
    quotes.  Any other character ends the literal, which is then reported as
    not closed.

    :param mystr: the complete program text.
    '''
    global _p, _value, _syn

    size = len(mystr)
    _value = ''

    # Skip blanks.  The bound check keeps trailing blanks from running off
    # the end of the text.
    _p = _scanWhile(mystr, _p, _BLANKS)
    if _p >= size:
        _syn = '@-1'
        return

    start = _p
    ch = mystr[_p]
    _p += 1

    if ch in _LETTERS or ch == '_':             # letter(letter|digit)*
        # Illegal characters are swallowed too, so that the whole malformed
        # identifier is reported as one error.
        _p = _scanWhile(mystr, _p, _IDENT_CHARS | frozenset(_abnormalChar))
        _value = mystr[start:_p]
        if any(bad in _value for bad in _abnormalChar):
            _syn = '@-6'
        elif _value in _key:
            _syn = _value.upper()               # keyword
        else:
            _syn = 'ID'
            inSymbolTable(_value)

    elif ch == '"':                             # string literal
        closed = False
        while _p < size and mystr[_p] in _STRING_CHARS:
            nxt = mystr[_p]
            _p += 1
            if nxt == '"':
                # Stop at the closing quote; what follows is a new token.
                closed = True
                break
        _value = mystr[start:_p]
        _syn = 'STRING' if closed else '@-2'

    elif ch in _DIGITS:                         # integer or floating point
        _p = _scanWhile(mystr, _p, _NUMBER_CHARS)
        _value = mystr[start:_p]
        # Classified from the lexeme alone, never from the previous token.
        _syn = _classifyNumber(_value)

    elif ch == '\'':                            # character literal
        # 1: after the opening quote, 2: after a backslash,
        # 3: body read, 4: closing quote read.
        state = 1
        while _p < size and mystr[_p] in _CHAR_CHARS:
            nxt = mystr[_p]
            _p += 1
            if state == 1:
                if nxt == '\\':
                    state = 2
                elif nxt in _CHAR_BODY:
                    state = 3
            elif state == 2:
                if nxt in 'nt':
                    state = 3
            elif state == 3:
                if nxt == '\'':
                    state = 4
                    break
        _value = mystr[start:_p]
        _syn = 'CHARACTER' if state == 4 else '@-4'

    elif ch in _PAIR_OPERATORS:                 # < <= > >= ! != + ++ - -- = == & && | ||
        if _p < size and mystr[_p] == _PAIR_OPERATORS[ch]:
            _p += 1
        _value = mystr[start:_p]
        _syn = _value

    elif ch in _SINGLE_DELIMITERS:              # * / ; ( ) { } [ ] ,
        _value = ch
        _syn = ch

    elif ch == '\n':
        _syn = '@-1'

    else:
        # Without this branch the previous token's code would be reported
        # a second time.
        _value = ch
        _syn = '@-8'

def inSymbolTable(token):
    '''Append ``token`` to the symbol table ``_mysymbol`` unless it is already there.'''
    if token in _mysymbol:
        return
    _mysymbol.append(token)

if __name__ == '__main__':
    # Error code -> (text printed before the lexeme, text printed after it).
    # The line number is appended to the second part.
    _errorText = {
        '@-2': ('字符串 ', ' 不封闭! Error in line '),
        '@-3': ('数字 ', ' 错误,不能以0开头! Error in line '),
        '@-4': ('字符 ', ' 不封闭! Error in line '),
        '@-5': ('数字 ', ' 不合法! Error in line '),
        '@-6': ('标识符', ' 不能包含非法字符!Error in line '),
        '@-7': ('数字 ', ' 不合法,包含字母! Error in line '),
    }

    getMyProm()
    outOfComment()

    # ``with`` closes the output files even if the analysis raises.
    with open(r'E://token.txt', 'w') as tokenFile:
        # ``<`` rather than ``!=`` so the loop also ends if the index ever
        # moves past the end of the text.
        while _p < len(_content):
            analysis(_content)
            if _syn == '@-1':
                _line += 1                      # count the lines of the program
            elif _syn in _errorText:
                _before, _after = _errorText[_syn]
                # A single parenthesised string prints the same on Python 2 and 3.
                print(_before + _value + _after + str(_line))
            elif _syn.startswith('@-'):
                # Any other error code: report it instead of writing it out
                # as if it were a token.
                print('词法错误 ' + _syn + ' ' + _value + '! Error in line ' + str(_line))
            else:                               # no lexical error
                tokenFile.write(str(_syn) + '@' + _value + '\n')

    with open(r'E://symbol_table.txt', 'w') as symbolTableFile:
        symbolTableFile.write('入口地址\t变量名\n')
        for _entry, symbolItem in enumerate(_mysymbol):
            symbolTableFile.write(str(_entry) + '\t\t\t' + symbolItem + '\n')