17 changes: 17 additions & 0 deletions refactor_notes.md
@@ -0,0 +1,17 @@
# Evil Refactor Plan

## Two parsers

We will have two parsers: one for the preprocessor language, since that's mostly separate from the rest of what we have to care about, and one for C++ that might contain macro calls and such. This should reduce complexity in both parsers.
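
A sketch of the split (the function names here are hypothetical stand-ins, not the real entry points):

```python
def parse_directive(chunk: str) -> None:
    """Stand-in for the preprocessor-language parser."""
    print("directive:", chunk.strip())


def parse_cpp_chunk(chunk: str) -> None:
    """Stand-in for the C++-side parser; macro calls may appear in here."""
    print("c++ chunk:", chunk.strip())


def dispatch(chunk: str) -> None:
    # Directive lines start with '#'; everything else is C++ text.
    if chunk.lstrip().startswith('#'):
        parse_directive(chunk)
    else:
        parse_cpp_chunk(chunk)
```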

## Handling lines

Since C++ statements can span multiple lines, we're just going to chunk files at their preprocessor directives. Everything between directives gets ingested at once, which should let us handle multiline statements without too much trouble.
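
Roughly like this (a sketch of the idea, not the actual implementation):

```python
from typing import Iterator, List


def chunk_source(lines: List[str]) -> Iterator[List[str]]:
    """Split a file at its preprocessor directives.

    Each directive line is its own chunk; everything between directives is
    yielded as one block, so multiline C++ statements stay together.
    """
    block: List[str] = []
    for line in lines:
        if line.lstrip().startswith('#'):
            if block:
                yield block
                block = []
            yield [line]  # the directive itself
        else:
            block.append(line)
    if block:
        yield block
```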

## List structures

We'll consume brace, bracket, and paren structures, since they can be used to group macro arguments. We need to make sure we can write them out essentially unchanged when they aren't relevant to macros.
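
Something like this, assuming string and char literals have already been lexed away (real code has to skip those so a paren inside a string doesn't confuse the count):

```python
PAIRS = {'(': ')', '[': ']', '{': '}'}


def consume_group(text: str, start: int) -> int:
    """Return the index just past the group that opens at text[start].

    The text itself is never modified, so a group that turns out not to be
    a macro argument list can be written back out exactly as it came in.
    """
    stack = [PAIRS[text[start]]]
    i = start + 1
    while i < len(text) and stack:
        ch = text[i]
        if ch in PAIRS:
            stack.append(PAIRS[ch])  # a nested group opens
        elif ch == stack[-1]:
            stack.pop()  # the current group closes
        i += 1
    if stack:
        raise ValueError("unbalanced delimiters")
    return i
```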

## Whitespace

Whitespace is binary: there or not there. It will still complicate the parse structures a little, sadly.
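
In ply terms that can be a single rule that folds a run of blanks into one token (a sketch of the approach; the new lexer below does the same):

```python
def t_WHITESPACE(t):
    r"[\t ]+"  # any run of spaces/tabs is one token: present or absent, never "how much"
    return t
```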
2 changes: 2 additions & 0 deletions src/pepper/__init__.py
@@ -8,6 +8,8 @@
 from . import abstract_symbol_tree  # NOQA
 from . import parser  # NOQA
+
+from . import preprocessor_language_lexer  # NOQA

 from ._version import get_versions
 __version__: str = get_versions()['version']  # type: ignore
 del get_versions
6 changes: 4 additions & 2 deletions src/pepper/lexer.py
@@ -103,7 +103,7 @@ def t_PREPROCESSING_KEYWORD_ENDIF(t: lex.LexToken) -> lex.LexToken:


 def t_PREPROCESSING_KEYWORD_IF(t: lex.LexToken) -> lex.LexToken:
-    r'\#if\b'
+    r'\#\s?if\b'
     return t


@@ -153,7 +153,9 @@ def t_IDENTIFIER(t: lex.LexToken) -> lex.LexToken:


 def t_INT_LITERAL(t: lex.LexToken) -> lex.LexToken:
-    r'[0-9]+'
+    r'[0-9]+L?'
+    if t.value[-1] == 'L':
+        t.value = t.value[:-1]
     return t


204 changes: 204 additions & 0 deletions src/pepper/preprocessor_language_lexer.py
@@ -0,0 +1,204 @@
#! /usr/bin/env python3

# This file is a part of the Pepper project, https://github.com/devosoft/Pepper
# (C) Michigan State University, under the MIT License
# See LICENSE.txt for more information

"""
This module handles the lexing of the C/C++ preprocessing language
"""

import sys
import ply.lex as lex
import argparse
import pepper.symbol_table as symtable
from typing import List, Optional


# Single-character tokens that pass straight through the lexer as themselves.
DEFAULT_LITERALS = ['+', '-', '*', '/', '|', '&', '(',
                    ')', '=', ',', '{', '}', '[', ']',
                    '.', ';', '!', '<', '>', ':', '~',
                    '^', '@', '#', "'", '%', '?', '\\']

literals = DEFAULT_LITERALS

PREPROCESSING_KEYWORDS = [
    'include',
    'define',
    'ifdef',
    'ifndef',
    'endif',
    'else',
    'if',
    'py',
    'error',
    'warning',
    'pragma'
]


tokens = [
    'BOOL_AND',
    'BOOL_OR',
    'CHAR_LITERAL',
    'COMP_EQU',
    'COMP_GTE',
    'COMP_LTE',
    'COMP_NEQU',
    'DEFINED',
    'IDENTIFIER',
    'INT_LITERAL',
    'L_SHIFT',
    'LONG_COMMENT',
    'NEWLINE',
    'OTHER',
    'PREPROCESSING_NUMBER',
    'PUNCTUATOR',
    'R_SHIFT',
    'STRING_LITERAL',
    'SYSTEM_INCLUDE_LITERAL',
    'WHITESPACE',
] + [f"PREPROCESSING_KEYWORD_{keyword.upper()}" for keyword in PREPROCESSING_KEYWORDS]


def t_IDENTIFIER(t: lex.LexToken) -> lex.LexToken:
    r'([_a-zA-Z][_a-zA-Z0-9]*(\.\.\.)?)|(\.\.\.)'
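    # The '...' alternatives let an ellipsis, bare or glued to a name, lex as
    # an identifier; that is how variadic macro parameter lists come through.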

    if t.value in PREPROCESSING_KEYWORDS:
        t.type = f"PREPROCESSING_KEYWORD_{t.value.upper()}"

    return t


def t_INT_LITERAL(t: lex.LexToken) -> lex.LexToken:
    r'[0-9]+L?'
    # Strip a trailing 'L' long suffix so downstream code sees a bare integer.
    if t.value[-1] == 'L':
        t.value = t.value[:-1]
    return t


def t_PREPROCESSING_NUMBER(t: lex.LexToken) -> lex.LexToken:
    r'\.?[0-9]([0-9]|(e\+)|(e\-)|(E\+)|(E\-)|(p\+)|(p\-)|(P\+)|(P\-)|[a-zA-Z])*'
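    # A C-style "preprocessing number": a digit (optionally after a dot)
    # followed by digits, letters, or signed e/E/p/P exponents, e.g. 1e+5 or 0x1p-3.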
    return t


def t_SYSTEM_INCLUDE_LITERAL(t: lex.LexToken) -> lex.LexToken:
    r"""<[^\'\"<>]*?>"""
    return t


def t_COMP_LTE(t: lex.LexToken) -> lex.LexToken:
    r"<="
    return t


def t_COMP_GTE(t: lex.LexToken) -> lex.LexToken:
    r">="
    return t


def t_COMP_EQU(t: lex.LexToken) -> lex.LexToken:
    r"=="
    return t


def t_COMP_NEQU(t: lex.LexToken) -> lex.LexToken:
    r"!="
    return t


def t_BOOL_AND(t: lex.LexToken) -> lex.LexToken:
    r"&&"
    return t


def t_BOOL_OR(t: lex.LexToken) -> lex.LexToken:
    r"\|\|"
    return t


def t_L_SHIFT(t: lex.LexToken) -> lex.LexToken:
    r"<<"
    return t


def t_R_SHIFT(t: lex.LexToken) -> lex.LexToken:
    r">>"
    return t


def t_STRING_LITERAL(t: lex.LexToken) -> lex.LexToken:
    r"""('((\\['tn])|[^'\\])*')|("((\\["tn])|[^"\\])*")"""
    return t


# TODO: maybe convert this to a t_ignore() rule for improved lexing performance
def t_NEWLINE(t: lex.LexToken) -> lex.LexToken:
    r"\n"
    t.type = 'NEWLINE'
    t.lexer.lineno += 1  # the lexer doesn't know what constitutes a 'line' unless we tell it
    symtable.LINE_COUNT += 1
    return t


def t_WHITESPACE(t: lex.LexToken) -> lex.LexToken:
    r"[\t ]+"  # fold a run of blanks into one token; the grammar only needs "space was here"
    return t


def t_error(t: lex.LexToken) -> lex.LexToken:
    raise symtable.PepperSyntaxError(f"Unknown token on line {t.lexer.lineno}: {t.value[0]}")


lexer = lex.lex()


def lex_input(lines: str, debug_mode: bool = False) -> None:
    """Takes a single string, containing newlines, that's the entire input."""
    lexer.input(lines)

    arcade: List[lex.LexToken] = []
    while True:
        tok: Optional[lex.LexToken] = lexer.token()
        if not tok:
            break  # end of file reached
        arcade.append(tok)

    ignore: List[str] = []  # token types to suppress unless we're debugging
    for token in arcade:
        try:
            if token.type in ignore:
                if debug_mode:
                    print(f"(IGNORED:) {token.type}: {token.value}")
                else:
                    continue
            elif token.type in literals:
                print(f"ASCII_LITERAL: {token.value}")
            elif token.type != 'UNKNOWN':
                print(f"{token.type}: {token.value}")
            else:
                print(f"Unknown token in input: {token.value}")
                sys.exit(1)
        except AttributeError:
            print(f'Blew up trying to access type of {token}')


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="The file to lex")
    parser.add_argument('--debug_mode', action='store_true')
    return parser.parse_args()


def main() -> None:
    args = get_args()

    lex_input(args.input_file.read(), args.debug_mode)


if __name__ == '__main__':
    main()
145 changes: 145 additions & 0 deletions src/pepper/preprocessor_language_parser.py
@@ -0,0 +1,145 @@
#! /usr/bin/env python3

# This file is a part of the Pepper project, https://github.com/devosoft/Pepper
# (C) Michigan State University, under the MIT License
# See LICENSE.txt for more information

"""
This is the Parser module for Pepper

This module implements the grammar for the preprocessor language, comprised of tokens from the Lexer module.
This module implements a main function, but this is only for debugging and will be removed on release.
"""

# flake8: noqa: E501
import pepper.symbol_table as symtable
import pepper.abstract_symbol_tree as ast
import sys
import argparse
import ply.yacc as yacc
from pepper.preprocessor_language_lexer import lexer
from pepper.preprocessor_language_lexer import tokens  # NOQA
from pepper.symbol_table import Node


def p_statement(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    program : '#' preprocessing_statement NEWLINE
    """
    p[0] = p[2]  # pass the parsed directive up so parse() can return it


def p_preprocessing_statement_to_all_statement_types(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    preprocessing_statement : define_statement
                            | include_statement
    """
    p[0] = p[1]


def p_define_expression_no_args(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    define_statement : PREPROCESSING_KEYWORD_DEFINE maybe_space IDENTIFIER maybe_space expressions
    """
    # p[0] = symtable.MacroExpansion(p[3], p[5])
    pass


# def p_define_expression_some_args(p: yacc.YaccProduction) -> yacc.YaccProduction:
# """
# define_expression : PREPROCESSING_KEYWORD_DEFINE maybe_space IDENTIFIER '(' identifier_list ')' maybe_space expressions
# """
# # print(f"Macro expansion for ident {p[3]} with args {p[5]}")
# # p[0] = symtable.MacroExpansion(p[3], p[8], args=p[5])
# pass


def p_include_expression_disambiguation(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    include_statement : include_statement_file
                      | include_statement_system
    """
    p[0] = p[1]


def p_include_expression_file(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    include_statement_file : PREPROCESSING_KEYWORD_INCLUDE maybe_space STRING_LITERAL
    """
    p[0] = ast.PreprocessorIncludeNode([p[3]], False)


def p_include_expression_system(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    include_statement_system : PREPROCESSING_KEYWORD_INCLUDE maybe_space SYSTEM_INCLUDE_LITERAL
    """
    p[0] = ast.PreprocessorIncludeNode([p[3]], True)


def p_maybe_space_empty(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    maybe_space :
    """
    pass


def p_maybe_space_nonempty(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    maybe_space : WHITESPACE
    """
    pass


def p_expressions(p: yacc.YaccProduction) -> yacc.YaccProduction:
    """
    expressions : IDENTIFIER
    """
    pass


def p_error(p: yacc.YaccProduction) -> yacc.YaccProduction:
    if p is None:
        raise symtable.PepperSyntaxError("Unexpected end of input")
    print(f"ERROR(line {p.lineno}): syntax error")
    print(p)
    raise symtable.PepperSyntaxError()


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="The file to parse")
    parser.add_argument('--debug_mode', action='store_true')
    return parser.parse_args()


def parse(source: str, debug_mode: bool = False) -> Node:
    if debug_mode:
        parser = yacc.yacc(debug=True)
    else:
        parser = yacc.yacc(debug=False, errorlog=yacc.NullLogger())
    parse_tree: Node = parser.parse(source, lexer=lexer)

    return parse_tree
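

# Hypothetical usage, assuming the include rules above (the define rules are
# still stubs):
#   parse('#include "stdio.h"\n')  # -> PreprocessorIncludeNode, quoted form
#   parse('#include <vector>\n')   # -> PreprocessorIncludeNode, system form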


def main() -> None:
    args = get_args()

    parse_tree = parse(args.input_file.read(), args.debug_mode)
    print(parse_tree)


if __name__ == "__main__":
    main()