利用PHP实现词法分析器与自定义语言
之前项目有一个需求,业务人员使用中文编写一些自定义公式,然后需要我们后台执行将结果返回到界面上,于是就基于有限状态机写了这个词法分析器,比较简单,希望能够抛砖引玉。
一、分析需求
输入中文公式,返回结果,比如:
现有薪资=10000;
个税起点=3000;
当前年份=2021;
如果(当前年份=2022){
个税起点=5000;
}
返回 (现有薪资-个税起点) * 0.2;
二、实现需求
最初的想法是使用字符串替换的方式,将中文关键字替换成 php 的关键字,然后调用 eval 执行,这样确实也是可以的,但是总觉得不是很美丽,并且不能实现动态解析。就想着自己实现一个简单的词法分析,然后结合 ast 将词法转换成 php 代码执行,岂不快哉。当前版本没有用到抽象语法树来生成代码,全部使用字符串拼接。
<?php

/**
 * Class Lexer
 *
 * A small hand-written lexer for a formula language whose identifiers and
 * keywords may be written in Chinese. parseInput() turns the source text into
 * a flat token list and convert() concatenates those tokens into PHP code.
 *
 * NOTE(review): the generated PHP is meant to be executed by the caller. The
 * language is not sandboxed (e.g. `$name(...)` is a variable function call in
 * PHP), so only trusted input should ever be evaluated.
 *
 * @package Sett\OaLang
 */
class Lexer
{
    /** @var array Keyword map: source word => PHP keyword emitted by convert(). */
    public $keywordList = [];

    /** @var string[] Single-character operators / punctuation known to the lexer. */
    public $operatorList = ["+", "-", "*", "/", "=", ">", "<", "!", "(", ")", "{", "}", ",", ";"];

    /** @var string Source code being tokenised. */
    private $input;

    /** @var string Character under the cursor, or the $eof sentinel. */
    private $currChar;

    /** @var int Zero-based character (not byte) offset of the cursor. */
    private $currCharPos = 0;

    /** @var string End-of-input sentinel; multi-character, so it can never equal a real input character. */
    private $eof = "eof";

    /** @var string Encoding used for every multibyte operation on $input. */
    private $currEncode = "UTF-8";

    // Token types.
    public const VAR = "variable";
    public const STR = "string";
    public const KW  = "keyword";
    public const OPR = "operator";
    public const INT = "integer"; // numeric literal; may contain a decimal point
    public const NIL = "null";    // spacer emitted after every keyword

    /**
     * Lexer constructor.
     *
     * @param string $input source code to tokenise
     */
    public function __construct(string $input)
    {
        $this->input = $input;
        $this->rewind();
    }

    /**
     * @param array $keywordList map of source word => PHP keyword
     */
    public function setKeywordList($keywordList)
    {
        $this->keywordList = $keywordList;
    }

    /**
     * Tokenise the whole input.
     *
     * The cursor is reset first, so the method can be called repeatedly and
     * always returns the same list.
     *
     * @return array list of tokens, each ["type" => string, "char" => string, "pos" => int]
     * @throws Exception when the input is empty or a string literal is not closed
     */
    public function parseInput()
    {
        if ($this->input === "") {
            throw new Exception("code can not be empty");
        }

        $this->rewind();
        $tokens = [];
        while (true) {
            $token = $this->nextToken();
            if ($token["type"] === "eof") {
                break;
            }
            $tokens[] = $token;
            // A keyword is always followed by a blank so that convert() does
            // not glue it to the next token ("return $x", not "return$x").
            if ($token["type"] === self::KW) {
                $tokens[] = $this->makeToken(self::NIL, " ");
            }
        }

        return $tokens;
    }

    /**
     * Read the next token at the cursor.
     *
     * @return array a token; its type is "eof" once the input is exhausted
     * @throws Exception when a string literal is not closed
     */
    public function nextToken()
    {
        $this->skipBlankChar();
        if ($this->currChar === "") {
            $this->currChar = $this->eof;
        }

        // A run of Chinese characters is a keyword or a variable.
        if ($this->isCnLetter()) {
            return $this->wordToken($this->matchUntilNextCharIsNotCn());
        }
        if ($this->isOperator()) {
            return $this->currToken(static::OPR, $this->currChar);
        }
        // matchNumber() stops ON the last digit; currToken() then steps past it.
        if ($this->isNumber()) {
            return $this->currToken(static::INT, $this->matchNumber());
        }
        // Test for the opening quote, not for the matched text: the literals
        // "" and "0" are falsy in PHP but are still valid strings.
        if ($this->isStr()) {
            return $this->currToken(static::STR, $this->matchCompleteStr());
        }
        // Only Latin-letter words reach this point; Chinese ones returned above.
        if ($this->isVar()) {
            return $this->wordToken($this->matchVar());
        }
        if ($this->currChar === $this->eof) {
            return $this->currToken('eof', $this->currChar);
        }

        // Best effort: any other character becomes a one-character variable token.
        return $this->currToken(static::VAR, $this->currChar);
    }

    /**
     * Translate a token list into PHP source by plain concatenation.
     *
     * By default every "+" is emitted as PHP's concatenation operator ".",
     * which also turns `1+2` into `1.2`. Pass $arithmeticPlus = true to keep a
     * "+" that directly touches a numeric literal as an arithmetic "+".
     *
     * @param array $tokens         tokens as returned by parseInput()
     * @param bool  $arithmeticPlus keep "+" next to a numeric literal as "+"
     * @return string generated PHP code (without an opening tag)
     */
    public function convert(array $tokens, bool $arithmeticPlus = false)
    {
        $tokens = array_values($tokens);
        $code = "";

        foreach ($tokens as $index => $token) {
            $char = $token["char"];
            switch ($token["type"]) {
                case static::KW:
                    $code .= $this->keywordList[$char] ?? $char;
                    break;
                case static::VAR:
                    $code .= '$' . $char;
                    break;
                case static::OPR:
                    $keepPlus = $arithmeticPlus
                        && $char === "+"
                        && ($this->isNumberToken($tokens, $index - 1)
                            || $this->isNumberToken($tokens, $index + 1));
                    $code .= $keepPlus ? $char : $this->replace($char);
                    break;
                case static::STR:
                    // Escape what PHP would otherwise interpret inside a
                    // double-quoted literal (backslash escapes, "$var", "{$expr}").
                    $code .= '"' . strtr($char, ['\\' => '\\\\', '$' => '\\$', '"' => '\\"']) . '"';
                    break;
                case static::INT:
                default:
                    $code .= $char;
            }
        }

        return $code;
    }

    /**
     * Put the cursor back on the first character of the input.
     */
    private function rewind()
    {
        $this->currCharPos = 0;
        $this->currChar = $this->charAt(0);
    }

    /**
     * @param int $pos zero-based character offset
     * @return string the character at $pos, or "" past the end of the input
     */
    private function charAt(int $pos): string
    {
        return (string) mb_substr($this->input, $pos, 1, $this->currEncode);
    }

    /**
     * @param int $offset distance from the cursor
     * @return string look-ahead character, or "" past the end of the input
     */
    private function peekChar(int $offset = 1): string
    {
        return $this->charAt($this->currCharPos + $offset);
    }

    /**
     * Build the token for a fully consumed word; the cursor already sits on
     * the character after it.
     *
     * A keyword reports the position of its last character and a variable the
     * position after the word (both as before). Neither advances the cursor,
     * so the character following the word is never dropped.
     *
     * @param string $word
     * @return array
     */
    private function wordToken(string $word)
    {
        if ($this->isKeyword($word)) {
            $token = $this->makeToken(static::KW, $word);
            $token["pos"] -= 1;

            return $token;
        }

        // Everything that is not a keyword is a variable.
        return $this->makeToken(static::VAR, $word);
    }

    /**
     * Consume a run of letters (Latin or Chinese).
     *
     * @param string $input optional prefix already read by the caller
     * @return string
     */
    private function matchVar(string $input = "")
    {
        $word = $input ?: '';
        while ($this->isVar()) {
            $word .= $this->currChar;
            $this->nextChar();
        }

        return $word;
    }

    /**
     * @return bool whether the current character can be part of a variable name
     */
    private function isVar()
    {
        return $this->isCnLetter() || $this->isEnLetter();
    }

    /**
     * Skip spaces, tabs and line breaks.
     */
    private function skipBlankChar()
    {
        while (in_array($this->currChar, [" ", "\t", "\n", "\r"], true)) {
            $this->nextChar();
        }
    }

    /**
     * Build a token at the current position, then advance by one character.
     *
     * @param string $type
     * @param string $word
     * @return array
     */
    private function currToken(string $type, $word)
    {
        $token = $this->makeToken($type, $word);
        $this->nextChar();

        return $token;
    }

    /**
     * @param string $type
     * @param string $char token text
     * @return array
     */
    private function makeToken(string $type, string $char)
    {
        return ["type" => $type, "char" => $char, "pos" => $this->currCharPos];
    }

    /**
     * @return bool whether the current character is a Latin letter (a-z, A-Z, bounds included)
     */
    private function isEnLetter()
    {
        return preg_match('/\A[A-Za-z]\z/', $this->currChar) === 1;
    }

    /**
     * @return bool whether the current character lies in U+4E00..U+9FA5
     */
    private function isCnLetter()
    {
        return preg_match('/\A[\x{4e00}-\x{9fa5}]\z/u', $this->currChar) === 1;
    }

    /**
     * @param string $char
     * @return bool whether $char is a single ASCII digit
     */
    private function isDigit(string $char): bool
    {
        return preg_match('/\A[0-9]\z/', $char) === 1;
    }

    /**
     * @return bool whether the current character starts a numeric literal
     */
    private function isNumber()
    {
        return $this->isDigit($this->currChar);
    }

    /**
     * Consume a numeric literal: digits, optionally followed by "." and more
     * digits. The cursor is left ON the last character of the literal.
     *
     * @return string
     */
    private function matchNumber()
    {
        $number = $this->currChar;
        while ($this->isDigit($this->peekChar(1))) {
            $this->nextChar();
            $number .= $this->currChar;
        }

        // A dot belongs to the number only when a digit follows it.
        if ($this->peekChar(1) === "." && $this->isDigit($this->peekChar(2))) {
            $this->nextChar();
            $number .= $this->currChar;
            while ($this->isDigit($this->peekChar(1))) {
                $this->nextChar();
                $number .= $this->currChar;
            }
        }

        return $number;
    }

    /**
     * @return bool whether the current character opens a string literal
     */
    private function isStr()
    {
        return $this->currChar === "\"";
    }

    /**
     * Consume a double-quoted string literal. The cursor is left ON the
     * closing quote. Escape sequences are not supported.
     *
     * @return string the text between the quotes
     * @throws Exception when the input ends before the closing quote
     */
    private function matchCompleteStr()
    {
        $text = "";
        if ($this->currChar !== "\"") {
            return $text;
        }

        $this->nextChar();
        while ($this->currChar !== "\"") {
            // Without this guard an unclosed quote appends the sentinel forever.
            if ($this->currChar === $this->eof) {
                throw new Exception("unterminated string literal");
            }
            $text .= $this->currChar;
            $this->nextChar();
        }

        return $text;
    }

    /**
     * @return bool whether the current character is a known operator
     */
    private function isOperator()
    {
        return in_array($this->currChar, $this->operatorList, true);
    }

    /**
     * Consume a run of Chinese characters.
     *
     * @return string
     */
    private function matchUntilNextCharIsNotCn()
    {
        $word = "";
        while ($this->isCnLetter()) {
            $word .= $this->currChar;
            $this->nextChar();
        }

        return $word;
    }

    /**
     * Advance the cursor by one character; past the end the current character
     * becomes the $eof sentinel.
     *
     * @return void
     */
    private function nextChar()
    {
        $this->currCharPos += 1;
        $this->currChar = $this->charAt($this->currCharPos);
        if ($this->currChar === "") {
            $this->currChar = $this->eof;
        }
    }

    /**
     * @param string $input
     * @return bool whether $input has a non-empty entry in the keyword map
     */
    private function isKeyword(string $input)
    {
        return ($this->keywordList[$input] ?? "") != "";
    }

    /**
     * @param array $tokens list-indexed tokens
     * @param int   $index
     * @return bool whether a numeric-literal token exists at $index
     */
    private function isNumberToken(array $tokens, int $index): bool
    {
        return isset($tokens[$index]) && $tokens[$index]["type"] === static::INT;
    }

    /**
     * Map a source operator to its PHP spelling: "+" becomes ".".
     *
     * @param string $char
     * @return string
     */
    private function replace(string $char)
    {
        return str_replace("+", ".", $char);
    }
}
三、如何使用
require __DIR__ . "/vendor/autoload.php";

// Define a piece of source code. A nowdoc is used so that PHP itself never
// interpolates anything inside the sample.
// Every statement must end with ";" - the lexer passes punctuation through
// verbatim, so a missing ";" ends up as a syntax error in the generated PHP.
// NOTE: 年龄 is never assigned in this sample; it is only there to show how a
// variable is emitted.
$code = <<<'EOF'
姓名= "腕豪";
问候= "你好啊";
地址=(1+2) * 3;
如果(地址 > 3){
    地址=1;
}否则{
    地址= "艾欧尼亚";
}
说话 = ("我" + "爱")+ "你";
返回 姓名+年龄;
EOF;

$lexer = new Lexer($code);

// Register your own keywords: source word => PHP keyword.
$kwMap = [
    "如果"     => "if",
    "否则"     => "else",
    "返回"     => "return",
    "否则如果" => "elseif",
];
$lexer->setKeywordList($kwMap);

// The generated tokens.
$tokens = $lexer->parseInput();

// Turn the tokens into PHP. You could also build an AST with php-parser and
// print it; here the tokens are simply concatenated.
var_dump($lexer->convert($tokens));
生成词
[
  { "type": "variable", "char": "姓名", "pos": 2 },
  { "type": "operator", "char": "=", "pos": 2 },
  { "type": "string", "char": "腕豪", "pos": 7 },
  { "type": "operator", "char": ";", "pos": 8 },
  { "type": "variable", "char": "问候", "pos": 13 },
  { "type": "operator", "char": "=", "pos": 13 },
  { "type": "string", "char": "你好啊", "pos": 17 },
  { "type": "operator", "char": ";", "pos": 18 },
  { "type": "variable", "char": "地址", "pos": 23 },
  { "type": "operator", "char": "=", "pos": 23 },
  { "type": "operator", "char": "(", "pos": 24 },
  { "type": "integer", "char": "1", "pos": 25 },
  { "type": "operator", "char": "+", "pos": 26 },
  { "type": "integer", "char": "2", "pos": 27 },
  { "type": "operator", "char": ")", "pos": 28 },
  { "type": "operator", "char": "*", "pos": 30 },
  { "type": "integer", "char": "3", "pos": 32 },
  { "type": "operator", "char": ";", "pos": 33 },
  { "type": "keyword", "char": "如果", "pos": 37 },
  { "type": "null", "char": " ", "pos": 38 },
  { "type": "operator", "char": "(", "pos": 38 },
  { "type": "variable", "char": "地址", "pos": 41 },
  { "type": "operator", "char": ">", "pos": 42 },
  { "type": "integer", "char": "3", "pos": 44 },
  { "type": "operator", "char": ")", "pos": 45 },
  { "type": "operator", "char": "{", "pos": 46 },
  { "type": "variable", "char": "地址", "pos": 55 },
  { "type": "operator", "char": "=", "pos": 55 },
  { "type": "integer", "char": "1", "pos": 56 },
  { "type": "operator", "char": ";", "pos": 57 },
  { "type": "operator", "char": "}", "pos": 60 },
  { "type": "keyword", "char": "否则", "pos": 62 },
  { "type": "null", "char": " ", "pos": 63 },
  { "type": "operator", "char": "{", "pos": 63 },
  { "type": "variable", "char": "地址", "pos": 72 },
  { "type": "operator", "char": "=", "pos": 72 },
  { "type": "string", "char": "艾欧尼亚", "pos": 78 },
  { "type": "operator", "char": ";", "pos": 79 },
  { "type": "operator", "char": "}", "pos": 82 },
  { "type": "variable", "char": "说话", "pos": 87 },
  { "type": "operator", "char": "=", "pos": 88 },
  { "type": "operator", "char": "(", "pos": 90 },
  { "type": "string", "char": "我", "pos": 93 },
  { "type": "operator", "char": "+", "pos": 94 },
  { "type": "string", "char": "爱", "pos": 97 },
  { "type": "operator", "char": ")", "pos": 98 },
  { "type": "operator", "char": "+", "pos": 99 },
  { "type": "string", "char": "你", "pos": 102 },
  { "type": "operator", "char": ";", "pos": 103 },
  { "type": "keyword", "char": "返回", "pos": 107 },
  { "type": "null", "char": " ", "pos": 108 },
  { "type": "variable", "char": "姓名", "pos": 111 },
  { "type": "operator", "char": "+", "pos": 111 },
  { "type": "variable", "char": "年龄", "pos": 114 },
  { "type": "operator", "char": ";", "pos": 114 }
]
输出:
$姓名="腕豪";$问候="你好啊";$地址=(1.2)*3;if ($地址>3){$地址=1;}else {$地址="艾欧尼亚";}$说话=("我"."爱")."你";return $姓名.$年龄;
能执行吗?当然能。还存在一些小 bug,不想改了。
四、使用场景
什么,居然有人说没什么用?oa 系统总有用到的时候。
到此这篇关于利用PHP实现词法分析器与自定义语言的文章就介绍到这了,更多相关PHP词法分析器内容请搜索服务器之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持服务器之家!
原文链接:https://mp.weixin.qq.com/s/tT50a0PDxcZSzRYh7pQFnA
1.本站遵循行业规范,任何转载的稿件都会明确标注作者和来源;2.本站的原创文章,请转载时务必注明文章作者和来源,不尊重原创的行为我们将追究责任;3.作者投稿可能会经我们编辑修改或补充。