在了解词法分析之前,先看看对单词的定义:
/// <summary>/// A structure for the result of word parsing./// </summary>public class Word{ public int AbsoluteLine; public int AbsoluteStartPos; public int AbsoluteEndPos; public int Offset; public int Column; public string Text; public bool IsAvailable; public Context.LocationInfo Location; }
注意为了方便计,实现中并未将所有词语都标记为单词,比如操作符。仅有关键字、变量名和函数名被标记为单词。
为了更好的分析源代码,创建了一个SourceCode类,以进行词法分析工作:
public class SourceCode{ //各种标记 public static char[] MultiLineCommentsStartMark = new char[] { '/', '*' }; public static char[] HexMark = new char[] { 'x', 'X' }; public static char[] ExponentialMark = new char[] { 'e', 'E' }; public static char[] FloatMark = new char[] { 'f', 'F' }; public static char[] PlusMinusMark = new char[] { '-', '+' }; public string Text = ""; public int Line = 0; public int ColumnOfCurrentLine = 0; public int PosOffset = 0; public int LineOffset = 0; public int Column; public Word LastWord; public SourceCode() { } public SourceCode(string txt); public void LoadFromFile(string path); public bool Eof; // 当前位置在整个代码中的绝对位置 public int AbsolutePos; // 当前行在整个代码中的绝对行 public int AbsoluteLine; public Context.LocationInfo Location; public char CurrentChar; // 重置位置索引(局部位置索引) public void ResetPos(); // 检测下一个字符是否与指定的字符匹配 public bool TestNextChar(char c); // 检测下一个字符是否在指定的字符集中 public bool TestNextChar(char[] chars); // 获取下一个字符,默认会跳过前面的空格 public void NextChar(bool skipSpace = true); // 获取从当前位置开始的剩余代码 public string Tail; public static bool IsDigit(char ch); public static bool IsLetter(char ch); public static bool IsSpace(char ch); public static bool IsOperator(char ch); public static bool IsBracket(char ch); public void SkipSpace(); // 以';'为分隔符,划分代码片断 public List<SourceCode> SplitStatement(); // 以','为分隔符,划分多个变量定义片断 public List<SourceCode> SplitMultiDeclaration(); // 划分参数 public List<SourceCode> SplitParameter(); // 获取一个单词 public static Word GetWord(SourceCode src); // 获取括号内的代码 public static SourceCode GetBracketCode(char leftBracket, char rightBracket, SourceCode src);}
更复杂的分析,与语法分析一道,整合到Parser类中。
Parser类用于对C代码进行语法分析并构造语法树。
/// <summary>/// Parser class parses the simple C source code to build/// the syntax serial./// </summary>public class Parser{ public enum ParsingErrorType { Warning, UnknownError, UndefinedVariable, UnexpectedKeyword, TokenExpected, SyntaxError, TypeError, FormatError, FunctionBodyUndefined }; public class ParsingEventArg { public Context Context; } public class ParsingWarningEventArg { public Context Context; public Context.LocationInfo Location; public string Description; } public class ParsingErrorEventArg { public Context Context; public Context.LocationInfo Location; public ParsingErrorType Error; public string Description; public bool Continue; } public event EventHandler<ParsingEventArg> OnParsing; public event EventHandler<ParsingWarningEventArg> OnParsingWarning; public event EventHandler<ParsingErrorEventArg> OnParsingError; /////////////////////////////// // Private member variables /////////////////////////////// private SourceCode m_sourceCode = null; private Word m_lastWord = null; private Expression.ExpressionNode m_lastExpNode = null; private Dictionary<char, char> m_escapeCharDict = new Dictionary<char, char>(); private int m_errorCount = 0; private int m_warningCount = 0; public String Source { get { return m_sourceCode.Text; } } public bool HasError { get { return m_errorCount > 0; } } public bool HasWarning { get { return m_warningCount > 0; } } public int ErrorCount { get { return m_errorCount; } } public int WarningCount { get { return m_warningCount; } } public int MaxError = 0; public int MaxWarning = 65535; public Context Parse(SourceCode src) { m_sourceCode = src; Context ctx = new Context(); if (Parse(ctx, src)) { //语法分析完成后,校验是否有未使用的变量或函数,以及对申明而未定义函数的使用 if (ValidateContextReference(ctx)) return ctx; } return null; } ...
语法分析的主要方法定义如下:
private bool Parse(Context ctx, SourceCode src){ bool res = true; ctx.Location.FirstLine = src.AbsoluteLine; ctx.Location.FirstPos = src.AbsolutePos; foreach (SourceCode stmt in src.SplitStatement()) // 逐语句进行处理 { try { // 检测do...while的while部分 if (ctx.Children.Count > 0 && ctx.Children.Last() is ControlFlow.DoWhileLoop) { if ((ctx.Children.Last() as ControlFlow.DoWhileLoop).Condition == null) { Word wordOfWhile = GetWord(stmt); if (wordOfWhile.Text != "while") { if (!NotifyError(ctx, wordOfWhile.Location, ParsingErrorType.SyntaxError, "\"while\" is expected.")) return false; } else { res = ParseControl_While(ctx, stmt, new Context.LocationInfo() { FirstLine = wordOfWhile.AbsoluteLine, FirstPos = wordOfWhile.AbsoluteStartPos }); if (!res) return false; else continue; } } } if (stmt.Text.EndsWith(";")) // 语句 { res = ParseStatement(ctx, stmt); } else { if (stmt.Text.EndsWith("}")) // 函数体或块 { if (stmt.Text.StartsWith("{")) // 块 { SourceCode blockSrc = new SourceCode() { LineOffset = stmt.AbsoluteLine, PosOffset = stmt.AbsolutePos, Text = stmt.Text.Substring(1, stmt.Text.Length - 2) }; Block block = new Block() { Name = Context.GetAnonymousName("block"), Location = new Context.LocationInfo() { FirstLine = stmt.AbsoluteLine, FirstPos = stmt.AbsolutePos } }; ctx.AddChild(block); res = Parse(block, blockSrc); block.Location.LastLine = stmt.AbsoluteLine; block.Location.LastPos = stmt.AbsolutePos; } else { // 函数 // 过滤控制结构 Word wordOfControlFlow = GetWord(stmt); if (Context.IsControlFlow(wordOfControlFlow.Text)) { res = ParseControlFlow(ctx, stmt, wordOfControlFlow); } else { stmt.ResetPos(); res = ParseFunction(ctx, stmt, wordOfControlFlow.Location); } } } } } catch (ParseException pe) { if (!NotifyError(ctx, ctx.Location, ParsingErrorType.SyntaxError, pe.Message)) return false; } if (!res) return false; } // for ctx.Location.LastLine = src.AbsoluteLine; ctx.Location.LastPos = src.AbsolutePos; return true;}
语句处理,分三种情况:申明、控制结构及表达式。
private bool ParseStatement(Context ctx, SourceCode src){ Word firstWord = GetWord(src); if (Context.IsDataType(firstWord.Text)) // 以类型打头 { //变量或函数申明 return ParseDeclare(ctx, src, firstWord); } else if (Context.IsControlFlow(firstWord.Text)) // 控制结构 { //Control return ParseControlFlow(ctx, src, firstWord); } else { // 表达式 src.ResetPos(); return ParseExpression(ctx, src, firstWord.Location); }}
函数解析的实现如下:
private bool ParseFunction(Context ctx, SourceCode src, Context.LocationInfo loc){ // 定位函数体 while (!src.Eof && src.CurrentChar != '{') src.NextChar(); // 头部位置信息 Context.LocationInfo headerLoc = loc; headerLoc.LastPos = src.AbsolutePos - 1; // 函数头部 SourceCode funcHeader = new SourceCode() { PosOffset = loc.FirstPos, LineOffset = loc.FirstLine, Text = src.Text.Substring(0, src.Column) }; // 解析头部 // 如成功, 一个FunctionDefine将被添加到当前Context的尾 if (!ParseStatement(ctx, funcHeader)) return false; src.NextChar(); // skip '{' // 函数体 SourceCode bodyStmt = new SourceCode() { PosOffset = src.AbsolutePos, LineOffset = src.AbsoluteLine, Text = src.Text.Substring(src.Column, src.Text.Length - src.Column - 1) }; // 函数对象 Function.FunctionDefine funcDef = ctx.Children.Last() as Function.FunctionDefine; funcDef.AddChild(new Block() { Name = Context.GetAnonymousName("block") }); // 递归解析函数体 if (Parse(funcDef.Body, bodyStmt)) { funcDef.Location = headerLoc; return true; } return false;}控制结构的解析如下:private bool ParseControlFlow(Context ctx, SourceCode src, Word wordOfControlFlow){ bool res = false; switch (wordOfControlFlow.Text) { case "if": res = ParseControl_If(ctx, src, wordOfControlFlow.Location); break; case "else": res = ParseControl_Else(ctx, src, wordOfControlFlow.Location); break; case "for": res = ParseControl_For(ctx, src, wordOfControlFlow.Location); break; case "do": res = ParseControl_DoWhile(ctx, src, wordOfControlFlow.Location); break; case "while": res = ParseControl_While(ctx, src, wordOfControlFlow.Location); break; case "switch": res = ParseControl_Switch(ctx, src, wordOfControlFlow.Location); break; case "continue": res = ParseControl_Continue(ctx, src, wordOfControlFlow.Location); break; case "break": res = ParseControl_Break(ctx, src, wordOfControlFlow.Location); break; case "return": res = ParseControl_Return(ctx, src, wordOfControlFlow.Location); break; default: { // Unsupported control flow. if (!NotifyError(ctx, wordOfControlFlow.Location, ParsingErrorType.SyntaxError, "Unsupported keyword.")) return false; } break; } // switch if (res) NotifyParsing(ctx.Children.Last()); return res;}
以if/else为例,说明控制结构的解析过程。
先看if部分:
private bool ParseControl_If(Context ctx, SourceCode src, Context.LocationInfo loc){ src.SkipSpace(); // get '(' if (src.CurrentChar != '(') if (!NotifyError(ctx, GetLocation(loc.FirstLine, loc.FirstPos, src.AbsoluteLine, src.AbsolutePos), ParsingErrorType.SyntaxError, "'(' is expected.")) return false; // 获取判断表达式 SourceCode condition = GetParenthesisCode(src); ControlFlow.IfThen stxIf = new ControlFlow.IfThen() { Location = new Context.LocationInfo() { FirstLine = loc.FirstLine, FirstPos = loc.FirstPos } }; ctx.AddChild(stxIf); //解析判断表达式 if (!ParseExpression(stxIf, condition, ref stxIf.Condition)) return false; // 尝试解析then部分代码 src.SkipSpace(); bool res = false; Block ThenBlock = new Block(); stxIf.AddChild(ThenBlock); if (src.CurrentChar == '{') //块? { SourceCode code = GetBlockCode(src); res = Parse(ThenBlock, code); stxIf.Location.LastLine = src.AbsoluteLine; stxIf.Location.LastPos = src.AbsolutePos; } else { // 单个语句? SourceCode stmt = new SourceCode() { PosOffset = src.AbsolutePos, LineOffset = src.AbsoluteLine, Text = src.Text.Substring(src.Column) }; res = Parse(ThenBlock, stmt); stxIf.Location.LastLine = stmt.AbsoluteLine; stxIf.Location.LastPos = stmt.AbsolutePos; } // else另案处理 return res;}
再看else部分:
private bool ParseControl_Else(Context ctx, SourceCode src, Context.LocationInfo loc){ // else 不能单独出现,前面必须有个if。这种处理方式也解决了else与if的就近匹配问题。 if (!(ctx.Children.Last() is ControlFlow.IfThen)) if (!NotifyError(ctx, loc, ParsingErrorType.SyntaxError, "\"else\" should not appear here.")) return false; // 上一个语法可能是多重if/then Context lastStx = ctx.Children.Last(); while (lastStx.Children.Count > 2) // Children数大于2,表示具有else部分,则尝试取得最后一个if/then { lastStx = lastStx.Children.Last(); } // 再次检测if/then if (!(lastStx is ControlFlow.IfThen)) if (!NotifyError(ctx, loc, ParsingErrorType.SyntaxError, "Can't find matched \"if\".")) return false; ControlFlow.IfThen stxIf = lastStx as ControlFlow.IfThen; src.SkipSpace(); bool res = false; Block elseBlock = new Block(); stxIf.AddChild(elseBlock); // Block if (src.CurrentChar == '{') { SourceCode code = GetBlockCode(src); res = Parse(elseBlock, code); lastStx.Location.LastLine = src.AbsoluteLine; lastStx.Location.LastPos = src.AbsolutePos; } else { // Statement SourceCode stmt = new SourceCode() { PosOffset = src.AbsolutePos, LineOffset = src.AbsoluteLine, Text = src.Text.Substring(src.Column) }; res = Parse(elseBlock, stmt); lastStx.Location.LastLine = stmt.AbsoluteLine; lastStx.Location.LastPos = stmt.AbsolutePos; } return res;}
for循环的解析:
private bool ParseControl_For(Context ctx, SourceCode src, Context.LocationInfo loc){ src.SkipSpace(); if (src.CurrentChar != '(') if (!NotifyError(ctx, GetLocation(loc.FirstLine, loc.FirstPos, src.AbsoluteLine, src.AbsolutePos), ParsingErrorType.SyntaxError, "'(' is expected.")) return false; // 括号内的代码 SourceCode stmt = GetParenthesisCode(src); List<SourceCode> stmtList = stmt.SplitStatement(); // 三部分,必须的 if (stmtList.Count != 3) if (!NotifyError(ctx, GetLocation(loc.FirstLine, loc.FirstPos, src.AbsoluteLine, src.AbsolutePos), ParsingErrorType.SyntaxError, "Syntax error.")) return false; ControlFlow.ForLoop stxFor = new ControlFlow.ForLoop() { Location = new Context.LocationInfo() { FirstLine = loc.FirstLine, FirstPos = loc.FirstPos } }; ctx.AddChild(stxFor); // 初始化 Context stxInit = new Context(); stxFor.AddChild(stxInit); if (!ParseStatement(stxInit, stmtList[0])) return false; // 条件判断 if (!ParseExpression(stxInit, stmtList[1], ref stxFor.Condition)) return false; // 迭代器 if (!ParseExpression(stxInit, stmtList[2], ref stxFor.Iterator)) return false; src.SkipSpace(); // 循环体 if (src.CurrentChar == '{') { stmt = GetBlockCode(src); } else { stmt = new SourceCode() { PosOffset = src.AbsolutePos, LineOffset = src.AbsoluteLine, Text = src.Text.Substring(src.Column) }; } Block block = new Block(); stxFor.AddChild(block); bool res = Parse(block, stmt); stxFor.Location.LastLine = stmt.AbsoluteLine; stxFor.Location.LastPos = stmt.AbsolutePos; return res;}
复杂的是switch:
private bool ParseControl_Switch(Context ctx, SourceCode src, Context.LocationInfo loc){ // Check condition src.SkipSpace(); if (src.CurrentChar != '(') if (!NotifyError(ctx, GetLocation(loc.FirstLine, loc.FirstPos, src.AbsoluteLine, src.AbsolutePos), ParsingErrorType.SyntaxError, "Expecte a '('.")) return false; ControlFlow.Switch stxSwitch = new ControlFlow.Switch() { Location = new Context.LocationInfo() { FirstLine = loc.FirstLine, FirstPos = loc.FirstPos } }; ctx.AddChild(stxSwitch); // Parse condition expression if (!ParseExpression(stxSwitch, GetParenthesisCode(src), ref stxSwitch.Condition)) { ctx.Children.RemoveAt(ctx.Children.Count - 1); return false; } // Add body stxSwitch.AddChild(new Block()); ControlFlow.Case stxDefault = new ControlFlow.Case(); // default part stxSwitch.Body.AddChild(stxDefault); // Check '{' src.SkipSpace(); if (src.CurrentChar != '{') if (!NotifyError(ctx, GetLocation(loc.FirstLine, loc.FirstPos, src.AbsoluteLine, src.AbsolutePos), ParsingErrorType.SyntaxError, "Expecte a '{'.")) return false; // Parse body SourceCode switchBodyStmt = GetBlockCode(src); Dictionary<object, bool> caseValDict = new Dictionary<object, bool>(); while (!switchBodyStmt.Eof) { switchBodyStmt.SkipSpace(); Word word = GetWord(switchBodyStmt); switch(word.Text) { case "case": { ControlFlow.Case stxCase = new ControlFlow.Case() { Location = new Context.LocationInfo() { FirstLine = word.AbsoluteLine, FirstPos = word.AbsoluteStartPos } }; switchBodyStmt.SkipSpace(); if (switchBodyStmt.CurrentChar == '\'') // char { Expression.Operand.Value charVal = GetCharValue(stxSwitch, switchBodyStmt); if (charVal == null) if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "Expecte a expression.")) return false; stxCase.Value = charVal; } else { if (SourceCode.IsDigit(switchBodyStmt.CurrentChar)) // number { stxCase.Value = GetNumberValue(ctx, switchBodyStmt); if ((stxCase.Value.GetTypeInfo(ctx).Type & PrimitiveDataType.BaseTypeMask) == PrimitiveDataType.FloatType) if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "Expression must be an integral constant value.")) return false; } else if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "Expression must have a constant value.")) return false; } if (caseValDict.ContainsKey(stxCase.Value.AsInt)) { if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "Case label value has appeared in this switch.")) return false; } else caseValDict.Add(stxCase.Value.AsInt, true); stxSwitch.Body.AddChild(stxCase); // Parse case body switchBodyStmt.SkipSpace(); if (switchBodyStmt.CurrentChar != ':') if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "identifier is undefined.")) return false; switchBodyStmt.NextChar(); // skip ':' stxCase.Location.LastLine = switchBodyStmt.AbsoluteLine; stxCase.Location.LastPos = switchBodyStmt.AbsolutePos; NotifyParsing(stxCase); stxCase.AddChild(new Block()); if (!Parse(stxCase.Body, GetCaseBodyCode(ctx, switchBodyStmt))) return false; } break; case "default": { if (stxDefault.Children.Count > 0) if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "default label has already appeared in this switch.")) return false; // Check ':' switchBodyStmt.SkipSpace(); if (switchBodyStmt.CurrentChar != ':') if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "':' is expected.")) return false; switchBodyStmt.NextChar(); // skip ':' stxDefault.AddChild(new Block() { Location = new Context.LocationInfo() { FirstLine = word.AbsoluteLine, FirstPos = word.AbsoluteStartPos, LastLine = src.AbsoluteLine, LastPos = src.AbsolutePos } }); if (!Parse(stxDefault.Body, GetCaseBodyCode(ctx, switchBodyStmt))) return false; } break; default: if (!NotifyError(ctx, GetLocation(word.AbsoluteLine, word.AbsoluteStartPos, switchBodyStmt.AbsoluteLine, switchBodyStmt.AbsolutePos), ParsingErrorType.SyntaxError, "\"case\" is expected but get \"" + word.Text + "\".")) return false; break; } } // while stxSwitch.Body.RemoveChild(stxDefault); stxSwitch.Body.AddChild(stxDefault); stxSwitch.Location.LastLine = src.AbsoluteLine; stxSwitch.Location.LastPos = src.AbsolutePos; return true;}
表达式解析分为两步,第一步先生成Statement对象,再以此对象为基础进行解析。
第一步:
private bool ParseExpression(Context ctx, SourceCode src, Context.LocationInfo loc){ Statement stmtStx = new Statement() { Name = Context.GetAnonymousName("statement"), Location = loc }; if (!ParseExpression(ctx, src, ref stmtStx.TargetExpression)) return false; ctx.AddChild(stmtStx); stmtStx.Location.LastLine = src.AbsoluteLine; stmtStx.Location.LastPos = src.AbsolutePos; NotifyParsing(stmtStx); return true;}
第二步:
private bool ParseExpression(Context ctx, SourceCode src, ref Expression.ExpressionNode expTree){ bool res = true; while (!src.Eof && res) { src.SkipSpace(); switch (src.CurrentChar) { case ',': { src.NextChar(); // skip ',' Statement stxExp = new Statement(); if (ctx.Parent != null) ctx.Parent.AddChild(stxExp); else ctx.AddChild(stxExp); res = ParseExpression(ctx, src, ref stxExp.TargetExpression); } break; case ';': src.NextChar(); break; // End of statement case '=': res = ParseExpression_Equal(ctx, src, ref expTree); break; case '+': res = ParseExpression_Plus(ctx, src, ref expTree); break; case '-': res = ParseExpression_Minus(ctx, src, ref expTree); break; case '*': res = ParseExpression_Mul(ctx, src, ref expTree); break; case '/': res = ParseExpression_Div(ctx, src, ref expTree); break; case '%': res = ParseExpression_Mod(ctx, src, ref expTree); break; case '&': res = ParseExpression_And(ctx, src, ref expTree); break; case '|': res = ParseExpression_Or(ctx, src, ref expTree); break; case '^': res = ParseExpression_Xor(ctx, src, ref expTree); break; case '!': res = ParseExpression_Not(ctx, src, ref expTree); break; case '~': res = ParseExpression_BitwiseNot(ctx, src, ref expTree); break; case '<': res = ParseExpression_Less(ctx, src, ref expTree); break; case '>': res = ParseExpression_Greater(ctx, src, ref expTree); break; case '(': res = ParseExpression_Parentheses(ctx, src, ref expTree); break; case '\'': res = ParseExpression_CharValue(ctx, src, ref expTree); break; case '"': { // const string res = ParseExpression_ConstStringValue(ctx, src, ref expTree); //if (!FireParsingFailedEvent(ctx, src, ParsingErrorType.SyntaxError, "String is not supported.")) // return false; } break; default: { Expression.ExpressionNode lastNode = m_lastExpNode; if (SourceCode.IsDigit(src.CurrentChar)) { res = ParseExpression_NumberValue(ctx, src, ref expTree); } else if (SourceCode.IsLetter(src.CurrentChar)) { res = ParseExpression_Var(ctx, src, ref expTree); } else if (!NotifyError(ctx, src.Location, ParsingErrorType.SyntaxError, "Syntax error.")) return false; if (!ValidateOperator(ctx, src, lastNode)) return false; } break; } // switch } // while !Eof return res;} // func ParseExpression