词法分析 |Python 实现LL(1) 文法

作者: print("") 分类: 编译原理 发布时间: 2025-01-05 20:52

文法如下:

文法:
1: lexp -> atom | list
2: atom -> number | identifier
3: list -> (lexp-seq)
4: lexp-seq -> lexp seq2
5: seq2 -> lexp seq2 | ε

需要判断是字符串为(a(b(2))(c))  并且输出详细的步骤

'''
文法:
1: lexp -> atom | list
2: atom -> number | identifier
3: list -> (lexp-seq)
4: lexp-seq -> lexp seq2 
5: seq2 -> lexp seq2 | ε
'''

# Token类型 
TOKEN_NUMBER = 'NUMBER'  # number 
TOKEN_IDENTIFIER = 'IDENTIFIER'  # identifier
TOKEN_LPAREN = 'LPAREN'     # ( 
TOKEN_RPAREN = 'RPAREN'     # )
TOKEN_EOF = 'EOF'          # 结束符

class Token:
    def __init__(self, type, value=None):
        self.type = type # 类型
        self.value = value # 值

class Lexer:
    def __init__(self, text):
        self.text = text # 输入文本
        self.pos = 0
        self.current_char = self.text[self.pos] if text else None

    def error(self):
        raise Exception('词法分析错误')

    def advance(self):
        self.pos += 1 # 移动到下一个字符
        if self.pos > len(self.text) - 1:
            self.current_char = None # 到达末尾
        else:
            self.current_char = self.text[self.pos] # 更新当前字符

    def skip_whitespace(self):
        while self.current_char and self.current_char.isspace():
            self.advance() # 跳过空白字符

    def number(self):
        result = ''
        while self.current_char and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        return int(result) # 返回整数

    def identifier(self):
        result = ''
        while self.current_char and self.current_char.isalnum():
            result += self.current_char
            self.advance()
        return result # 返回标识符

    def get_next_token(self):
        while self.current_char:
            if self.current_char.isspace():
                self.skip_whitespace() # 跳过空白字符
                continue

            if self.current_char.isdigit():
                return Token(TOKEN_NUMBER, self.number()) # 返回数字    

            if self.current_char.isalpha():
                return Token(TOKEN_IDENTIFIER, self.identifier()) # 返回标识符          

            if self.current_char == '(':
                self.advance()
                return Token(TOKEN_LPAREN, '(') # 返回左括号    

            if self.current_char == ')':
                self.advance()
                return Token(TOKEN_RPAREN, ')') # 返回右括号

            self.error()

        return Token(TOKEN_EOF) # 返回结束符

class Parser:
    def __init__(self, lexer):
        self.lexer = lexer
        self.current_token = self.lexer.get_next_token()
        self.step = 1  # 添加步骤计数
        
        # 构建First集
        self.FIRST = {
            'lexp': {TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # lexp -> atom | list   
            'atom': {TOKEN_NUMBER, TOKEN_IDENTIFIER}, # atom -> number | identifier
            'list': {TOKEN_LPAREN}, # list -> (lexp-seq)
            'lexp-seq': {TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # lexp-seq -> lexp seq2
            'seq2': {TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN, 'ε'} # seq2 -> lexp seq2 | ε
        }
        
        # 构建Follow集
        self.FOLLOW = {
            'lexp': {TOKEN_RPAREN, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # lexp -> atom | list
            'atom': {TOKEN_RPAREN, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # atom -> number | identifier
            'list': {TOKEN_RPAREN, TOKEN_EOF, TOKEN_NUMBER, TOKEN_IDENTIFIER, TOKEN_LPAREN}, # list -> (lexp-seq)
            'lexp-seq': {TOKEN_RPAREN}, # lexp-seq -> lexp seq2
            'seq2': {TOKEN_RPAREN} # seq2 -> lexp seq2 | ε
        }
        
        # 构建预测分析表
        self.PREDICT = {
            ('lexp', TOKEN_NUMBER): 'atom', # lexp -> atom
            ('lexp', TOKEN_IDENTIFIER): 'atom', # lexp -> atom
            ('lexp', TOKEN_LPAREN): 'list', # lexp -> list
            
            ('atom', TOKEN_NUMBER): 'number', # atom -> number
            ('atom', TOKEN_IDENTIFIER): 'identifier', # atom -> identifier
            
            ('list', TOKEN_LPAREN): '(lexp-seq)',
            
            ('lexp-seq', TOKEN_NUMBER): 'lexp seq2', # lexp-seq -> lexp seq2        
            ('lexp-seq', TOKEN_IDENTIFIER): 'lexp seq2', # lexp-seq -> lexp seq2
            ('lexp-seq', TOKEN_LPAREN): 'lexp seq2', # lexp-seq -> lexp seq2
            
            ('seq2', TOKEN_NUMBER): 'lexp seq2', # seq2 -> lexp seq2
            ('seq2', TOKEN_IDENTIFIER): 'lexp seq2', # seq2 -> lexp seq2
            ('seq2', TOKEN_LPAREN): 'lexp seq2', # seq2 -> lexp seq2
            ('seq2', TOKEN_RPAREN): 'ε' # seq2 -> ε
        }

    def predict(self, non_terminal):
        """使用预测分析表进行分析"""
        key = (non_terminal, self.current_token.type)
        if key not in self.PREDICT:
            self.error()
        return self.PREDICT[key]

    def error(self):
        raise Exception('语法分析错误')

    def eat(self, token_type, stack=''):
        if self.current_token.type == token_type:
            # 显示终结符匹配
            remaining_input = self.get_remaining_input()
            match_str = f"匹配 {self.current_token.value}" if self.current_token.value else f"匹配 {token_type}"
            self.print_step(stack, remaining_input, match_str)
            
            self.current_token = self.lexer.get_next_token()
        else:
            self.error()

    def print_step(self, stack, input_str, action):
        """打印当前分析步骤"""
        # 只保留栈的最后一个非终结符
        stack_parts = stack.split()
        short_stack = stack_parts[-1] if stack_parts else ''
        
        # 缩短输入显示,只显示前15个字符
        short_input = input_str[:15] + ('...$' if len(input_str) > 15 else '')
        
        print(f"{self.step:<3} {short_stack:<15} {short_input:<15} {action}")
        self.step += 1

    def get_remaining_input(self):
        """获取剩余输入"""
        pos = self.lexer.pos - 1 if self.current_token.type != TOKEN_EOF else self.lexer.pos
        return self.lexer.text[pos:] + '$'

    # lexp -> atom | list
    def lexp(self, stack=''):
        current_stack = f"{stack}lexp"
        remaining_input = self.get_remaining_input()
        
        production = self.predict('lexp')
        if production == 'atom':
            self.print_step(current_stack, remaining_input, 'lexp -> atom')
            self.atom(current_stack)
        elif production == 'list':
            self.print_step(current_stack, remaining_input, 'lexp -> list')
            self.list(current_stack)
        else:
            self.error()

    # atom -> number | identifier
    def atom(self, stack=''):
        current_stack = f"{stack} atom"
        remaining_input = self.get_remaining_input()
        
        if self.current_token.type == TOKEN_NUMBER:
            self.print_step(current_stack, remaining_input, 'atom -> number')
            self.eat(TOKEN_NUMBER, current_stack)  # 传入当前栈
        elif self.current_token.type == TOKEN_IDENTIFIER:
            self.print_step(current_stack, remaining_input, 'atom -> identifier')
            self.eat(TOKEN_IDENTIFIER, current_stack)  # 传入当前栈
        else:
            self.error()

    # list -> (lexp-seq)
    def list(self, stack=''):
        current_stack = f"{stack} list"
        remaining_input = self.get_remaining_input()
        self.print_step(current_stack, remaining_input, 'list -> (lexp-seq)')
        
        self.eat(TOKEN_LPAREN, current_stack)  # 传入当前栈
        self.lexp_seq(current_stack)
        self.eat(TOKEN_RPAREN, current_stack)  # 传入当前栈

    # lexp-seq -> lexp seq2
    def lexp_seq(self, stack=''):
        current_stack = f"{stack} (lexp-seq)"
        remaining_input = self.get_remaining_input()
        self.print_step(current_stack, remaining_input, 'lexp-seq -> lexp seq2')
        
        self.lexp(current_stack)
        self.seq2(current_stack)

    # seq2 -> lexp seq2 | ε
    def seq2(self, stack=''):
        current_stack = f"{stack} seq2"
        remaining_input = self.get_remaining_input()
        
        production = self.predict('seq2')
        if production == 'lexp seq2':
            self.print_step(current_stack, remaining_input, 'seq2 -> lexp seq2')
            self.lexp(current_stack)
            self.seq2(current_stack)
        elif production == 'ε':
            self.print_step(current_stack, remaining_input, 'seq2 -> ε')
            return
        else:
            self.error()

    def parse(self):
        print(f"{'步骤':<3} {'栈':<15} {'输入':<15} {'动作'}")
        print("-" * 50)
        self.lexp()
        if self.current_token.type != TOKEN_EOF:
            self.error()
        return True

def main():
    # 测试用例
    test = "(a(b(2))(c))"  
    print(f"\n分析输入: {test}\n")
    
    try:
        lexer = Lexer(test)
        parser = Parser(lexer)
        result = parser.parse()
        print("\n语法分析结果: 合法")
    except Exception as e:
        print(f"\n语法分析结果: 不合法 ({str(e)})")

if __name__ == '__main__':
    main()


执行效果

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注