编译原理课设 第一阶段
- GUET编译原理课设 词法分析
- 主要参考了 GUET_曼陀罗华 的博客(好像是位研究生姐姐),改了其中一些部分,在小姐姐基础上增加了出错控制。
- 其实有更好的方式,不过前期我是这样写的,就先贴出来,循序渐进。
正文如下:
词法分析是干什么的:
- 过滤掉所有的空格、换行、注释
- 把有用的东西(单词)存到 `pascal[ ]` 数组中,比如 BEGIN 存到 `pascal[0]`,VAR 存到 `pascal[1]` 中
...
这个数组可以用于以后的语法和语义分析
`pascal[ ]` 数组是 `dual` 类型的,除了保存单词的信息,还保存单词的种类 `dual_type`。
这里写出思路,具体的小细节根据实际情况修改。
比如关键字数组、类型码等。
符号 | 编号 | 助记符 |
---|---|---|
结束符 | 0 | FINISH |
BEGIN | 1 | BEGIN |
END | 2 | END |
IF | 3 | IF |
THEN | 4 | THEN |
WHILE | 5 | WHILE |
ELSE | 6 | ELSE |
DO | 7 | DO |
VAR | 8 | VAR |
INTEGER | 9 | INTEGER |
整数 | 10 | INT |
标识符 | 11 | ID |
+ | 101 | ADD |
- | 102 | SUB |
* | 103 | MUL |
/ | 104 | DIV |
> | 105 | GT |
= | 106 | EQ |
< | 107 | LT |
: | 108 | COLON |
:= | 109 | COL_EQ |
<> | 110 | NE |
<= | 111 | LE |
>= | 112 | GE |
; | 113 | FIN |
// | 114 | ANND |
/* | 115 | ANNF |
*/ | 116 | ANNL |
, | 117 | CO |
( | 118 | LL |
) | 119 | RR |
#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iomanip>
#include <iostream>

using namespace std;

// Capacity of the lexeme buffers inside one token record.
const int LEXEME_CAP = 50;
// Capacity of the DUAL and pascal token tables.
const int MAX_TOKENS = 100;
// Longest word (keyword or identifier) accepted without error 1011.
const int MAX_WORD_LEN = 8;
// Largest integer literal accepted without error 1013.
const int MAX_INT_VALUE = 65535;
// Range of indices in keyword[] that hold real reserved words.
const int FIRST_KEYWORD = 1;
const int LAST_KEYWORD = 9;

// One scanned token.
//   dual_type : numeric type code (see type[] / typesign[]).
//   lexeme    : NUL-terminated text for words and operators, or a list of
//               decimal digits terminated by -1 for integer literals.
//   x, y      : row and column (both 1-based) where the token starts.
struct dual {
    int dual_type;
    union {
        char lexeme_text[LEXEME_CAP];
        int lexeme_num[LEXEME_CAP];
    } lexeme;
    int x;
    int y;
} DUAL[MAX_TOKENS];

// Tokens that passed validation; this is what later phases consume.
int pasnum = 0;
dual pascal[MAX_TOKENS];

// Next free index in DUAL.
int num = 0;

// Reserved words, placed so that the array index equals the type code.
// Index 0 ("sign") and index 10 ("INT") are placeholders, not reserved words.
const char * keyword[] = { "sign","BEGIN","END","IF","THEN","WHILE" ,"ELSE","DO","VAR","INTEGER","INT" };

// Single-character delimiters.
char singword[10] = "+-*=(),;";

// Characters that may start a two-character delimiter (or a comment).
char doubleword[10] = "><:/";

// Type codes and, at the same index, their mnemonics.
int type[31] = {0,1,2,3,4,5,6,7,8,9,10,11,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119};
const char * typesign[31] = { "FINISH","BEGIN","END","IF","THEN","WHILE" ,"ELSE","DO","VAR","INTEGER","INT","ID","ADD","SUB","MUL","DIV","GT","EQ","LT","COLON","COL_EQ","NE","LE","GE","FIN","ANND","ANNF","ANNL","CO","LL","RR" };

// Returns the index of dual_type inside type[] (and therefore typesign[]),
// or 0 when the code is unknown.
int findSignIndex(int dual_type) {
    for (int i = 0; i < 31; i++) {
        if (type[i] == dual_type) {
            return i;
        }
    }
    return 0;
}

// Converts a list of decimal digits terminated by -1 into an int.
// The result saturates at INT_MAX instead of overflowing, so callers can
// detect oversized literals with a plain comparison.
int toint(int lexeme_text[]) {
    long long value = 0;
    for (int i = 0; lexeme_text[i] != -1; i++) {
        value = value * 10 + lexeme_text[i];
        if (value > INT_MAX) {
            return INT_MAX;
        }
    }
    return static_cast<int>(value);
}

// Returns 1 when ch is one of the single-character delimiters in singword.
int isSingle(char ch) {
    // strchr would also match the string terminator, so reject it up front.
    if (ch == '\0') {
        return 0;
    }
    return strchr(singword, ch) != NULL ? 1 : 0;
}

// Returns 1 when ch can start a two-character delimiter or a comment.
int isDoubelStar(char ch) {
    if (ch == '\0') {
        return 0;
    }
    return strchr(doubleword, ch) != NULL ? 1 : 0;
}

// Fills a token record with a one-character lexeme and the given type code.
// NOTE(review): the record is received by value, so the caller's object is
// not modified; scaner() does not call this helper. It is kept only so the
// existing signature stays available.
int handSingle(dual dual_element, int dual_type, char ch) {
    dual_element.dual_type = dual_type;
    dual_element.lexeme.lexeme_text[0] = ch;
    dual_element.lexeme.lexeme_text[1] = '\0';
    return 1;
}

// Prints one diagnostic line with its error code and source position.
int errMsg(int err_type, int row, int column, const char msg[]) {
    cout << "Error" << err_type << ":" << row << "行 " << column << "列" << " 原因:" << msg << endl;
    return 1;
}

// Stores a one- or two-character lexeme; pass 0 as `second` for one character.
static void setLexemeText(dual &token, int first, int second) {
    token.lexeme.lexeme_text[0] = static_cast<char>(first);
    token.lexeme.lexeme_text[1] = static_cast<char>(second);
    token.lexeme.lexeme_text[2] = '\0';
}

// Appends a token to DUAL and, when accepted, to pascal as well.
// Returns 0 without storing anything when either table is full.
static int storeToken(const dual &token, int accepted) {
    if (num >= MAX_TOKENS || pasnum >= MAX_TOKENS) {
        return 0;
    }
    DUAL[num++] = token;
    if (accepted) {
        pascal[pasnum++] = token;
    }
    return 1;
}

// Scans "a.txt" and fills DUAL / pascal with the tokens found.
//
// Whitespace and comments are skipped. Every scanned token is recorded in
// DUAL; tokens are also copied to pascal, except integer literals that
// overflow.
//
// Diagnostics:
//   1011  word longer than MAX_WORD_LEN letters (still stored)
//   1013  integer literal larger than MAX_INT_VALUE (not stored in pascal)
//   1014  ':' not followed by '=' (stored as COLON)
//   1015  block comment not closed before end of file
//   1016  illegal character
//   1017  more than MAX_TOKENS tokens; scanning stops
//
// Returns 1 when the whole file was scanned without any diagnostic,
// 0 otherwise (including when the file cannot be opened).
int scaner() {
    FILE *file = fopen("a.txt", "r");
    if (file == NULL) {
        return 0;
    }

    int row = 1;
    int clumn = 1;
    int scan_success_flag = 1;
    // getc returns an int so that EOF stays distinguishable from every byte.
    int ch = getc(file);

    while (ch != EOF) {
        // Whitespace is handled one character at a time so that any mix of
        // spaces, tabs and newlines (and EOF right after them) is consumed.
        if (ch == '\n') {
            row++;
            clumn = 1;
            ch = getc(file);
            continue;
        }
        if (ch == ' ' || ch == '\t') {
            // Spaces and tabs both count as one column.
            clumn++;
            ch = getc(file);
            continue;
        }

        dual token;
        memset(&token, 0, sizeof(token));
        token.x = row;
        token.y = clumn;
        int produced = 1;  // a token was scanned in this iteration
        int accepted = 1;  // the token is also copied to pascal

        if (isalpha(ch)) {
            // Keyword or identifier: a run of letters.
            int length = 0;
            int stored = 0;
            while (isalpha(ch)) {
                // Letters beyond the buffer are counted but not stored.
                if (stored < LEXEME_CAP - 1) {
                    token.lexeme.lexeme_text[stored++] = static_cast<char>(ch);
                }
                length++;
                clumn++;
                ch = getc(file);
            }
            token.lexeme.lexeme_text[stored] = '\0';

            if (length > MAX_WORD_LEN) {
                errMsg(1011, row, token.y, "单词超长");
                scan_success_flag = 0;
            }

            // Identifier unless the text matches a reserved word.
            token.dual_type = 11;
            for (int k = FIRST_KEYWORD; k <= LAST_KEYWORD; k++) {
                if (strcmp(token.lexeme.lexeme_text, keyword[k]) == 0) {
                    token.dual_type = k;
                    break;
                }
            }
        }
        else if (isdigit(ch)) {
            // Integer literal: a run of digits. Letters that follow directly
            // start a new token; the "非数字后缀" check (1012) of the earlier
            // version could never trigger and has been dropped.
            int stored = 0;
            int truncated = 0;
            while (isdigit(ch)) {
                if (stored < LEXEME_CAP - 1) {
                    token.lexeme.lexeme_num[stored++] = ch - '0';
                } else {
                    truncated = 1;
                }
                clumn++;
                ch = getc(file);
            }
            token.lexeme.lexeme_num[stored] = -1;

            if (truncated || toint(token.lexeme.lexeme_num) > MAX_INT_VALUE) {
                // Reported at the column of the literal's last digit.
                errMsg(1013, row, clumn - 1, "数字过大,溢出");
                scan_success_flag = 0;
                accepted = 0;
            } else {
                token.dual_type = 10;
            }
        }
        else if (isSingle(static_cast<char>(ch))) {
            setLexemeText(token, ch, 0);
            switch (ch) {
            case '+': token.dual_type = 101; break;
            case '-': token.dual_type = 102; break;
            case '*': token.dual_type = 103; break;
            // '/' is handled with the two-character delimiters below.
            case '=': token.dual_type = 106; break;
            case ';': token.dual_type = 113; break;
            case ',': token.dual_type = 117; break;
            case '(': token.dual_type = 118; break;
            case ')': token.dual_type = 119; break;
            default:
                cout << "isSingle出错:" << static_cast<char>(ch) << endl;
                break;
            }
            clumn++;
            ch = getc(file);
        }
        else if (isDoubelStar(static_cast<char>(ch))) {
            int next_ch = getc(file);
            switch (ch) {
            case '<':
                if (next_ch == '=') {
                    token.dual_type = 111;
                    setLexemeText(token, ch, next_ch);
                    clumn += 2;
                    ch = getc(file);
                } else if (next_ch == '>') {
                    token.dual_type = 110;
                    setLexemeText(token, ch, next_ch);
                    clumn += 2;
                    ch = getc(file);
                } else {
                    token.dual_type = 107;
                    setLexemeText(token, ch, 0);
                    clumn++;
                    // The look-ahead character starts the next token.
                    ch = next_ch;
                }
                break;
            case '>':
                if (next_ch == '=') {
                    token.dual_type = 112;
                    setLexemeText(token, ch, next_ch);
                    clumn += 2;
                    ch = getc(file);
                } else {
                    token.dual_type = 105;
                    setLexemeText(token, ch, 0);
                    clumn++;
                    ch = next_ch;
                }
                break;
            case ':':
                if (next_ch == '=') {
                    token.dual_type = 109;
                    setLexemeText(token, ch, next_ch);
                    clumn += 2;
                    ch = getc(file);
                } else {
                    // A lone ':' is reported but still recorded as COLON.
                    token.dual_type = 108;
                    setLexemeText(token, ch, 0);
                    errMsg(1014, row, token.y, "期待的 '=' 没有出现,':'之后缺少 '=' ");
                    scan_success_flag = 0;
                    clumn++;
                    ch = next_ch;
                }
                break;
            case '/':
                if (next_ch == '/') {
                    // Line comment: skip to the end of the line. The newline
                    // itself is left for the main loop, which advances row.
                    produced = 0;
                    ch = getc(file);
                    while (ch != EOF && ch != '\n') {
                        ch = getc(file);
                    }
                } else if (next_ch == '*') {
                    // Block comment: skip until the closing "*/".
                    produced = 0;
                    clumn += 2;
                    int closed = 0;
                    int previous = 0;
                    int current = getc(file);
                    while (current != EOF) {
                        if (current == '\n') {
                            row++;
                            clumn = 1;
                        } else {
                            clumn++;
                        }
                        if (previous == '*' && current == '/') {
                            closed = 1;
                            break;
                        }
                        previous = current;
                        current = getc(file);
                    }
                    if (closed) {
                        ch = getc(file);
                    } else {
                        errMsg(1015, row, clumn, "没有期待的 '*/' 出现,不合法的注释");
                        scan_success_flag = 0;
                        ch = EOF;
                    }
                } else {
                    // Plain division operator.
                    token.dual_type = 104;
                    setLexemeText(token, ch, 0);
                    clumn++;
                    ch = next_ch;
                }
                break;
            default:
                break;
            }
        }
        else {
            produced = 0;
            errMsg(1016, row, clumn, "非法字符");
            scan_success_flag = 0;
            clumn++;
            ch = getc(file);
        }

        if (produced && !storeToken(token, accepted)) {
            errMsg(1017, row, clumn, "单词数量超出上限");
            scan_success_flag = 0;
            break;
        }
    }

    fclose(file);
    return scan_success_flag;
}

// Runs the scanner and prints every accepted token with its type code and
// mnemonic.
int main() {
    if (scaner()) {
        cout << "====== 分析成功 ======" << endl;
    }
    cout << endl << "====== 输出扫描合法的词元记录 ======" << endl;
    cout << endl << " 单词 类型 助记符" << endl;
    for (int i = 0; i < pasnum; i++) {
        const char *mnemonic = typesign[findSignIndex(pascal[i].dual_type)];
        if (pascal[i].dual_type == 10) {
            // Integer literals are stored as digit lists, not as text.
            cout << " " << std::left << setw(12) << toint(pascal[i].lexeme.lexeme_num) << setw(8) << pascal[i].dual_type << mnemonic << endl;
        } else {
            cout << " " << std::left << setw(12) << pascal[i].lexeme.lexeme_text << setw(8) << pascal[i].dual_type << mnemonic << endl;
        }
    }
    system("pause");
    return 0;
}
测试文件:a.txt
BEGIN464645545454VAR a b;C:=2*Pi*R;IF A >= 3 THEN A:=(2*6+1)*9ELSE A:=3*6;A++;653// 666注释/** 多行的*注释~*//*cdgvvvds*/+-*/>=<=<>:=()<>#END
运行结果: