Module src.frontend.lexer

The lexer class definition. Uses rply's lexer generator as a starting point; regexes and token names are added to it in order to tokenize.

Expand source code
"""
The lexer class definition. Uses rply's lexer generator as a starting point; regexes and token names are added to it in order to tokenize.
"""

from rply import LexerGenerator
from rply.errors import LexingError
import copy
import re

class Lexer():
    """
    The lexer class definition.

    Wraps an rply `LexerGenerator`, registers the token rules on it and hands
    back the built lexer through `get_lexer`.
    """
    def __init__(self):
        """
        Constructs the Lexer object.
        """
        self.lexer = LexerGenerator()
        # Set once the rules are registered so repeated get_lexer() calls do
        # not append the whole rule set to the generator again.
        self._tokens_added = False

    def _add_tokens(self):
        """
        Adds tokens to the rply lexer object.

        When several rules could match at the same position, rply picks the
        rule that was added first, so the order of the calls below matters:
        longer or more specific patterns must be registered before the
        shorter ones they overlap with.
        """
        self.lexer.add("COMMENT",       r"/(\*(\w|\W)*?\*/|/([^\n]*))") # Catches both multi-line and single line comments
        self.lexer.add("PREPROCESSOR",  r"#\s*(warning|else|endif|include|undef|ifdef|ifndef|if|elif|pragma|define|error|line)([\t\f ]+[^\s]+)*")
        self.lexer.add("CHAR",          r"\'\\?[\w\;\\ \%\"\']\'")
        self.lexer.add("STRING",        r"(\"[^\n]*?(?<!\\)\")|(\'[^\n]*?(?<!\\)\')") # Classifies single characters and multiple characters as a string
        self.lexer.add("HEX",           r"0x[\dA-Fa-f]+")
        self.lexer.add("OCT",           r"0[0-7]{1,3}")
        self.lexer.add("BIN",           r"0b[01]+")
        # A digit is required on at least one side of the point; a lone "."
        # must fall through to ACCESS instead of being taken as a number.
        self.lexer.add("PRECISION",     r"(\d|[1-9]\d+)\.\d*|\.\d+")
        self.lexer.add("INTEGER",       r"([1-9]\d*|\d)")
        # Registered ahead of the "-" operators: if SUB came first, "->" would
        # be split into SUB followed by GT and ACCESS could never match it.
        self.lexer.add("ACCESS",        r"->|\.")
        self.lexer.add("EQ",            r"={2}")
        self.lexer.add("AEQ",           r"\+=")
        self.lexer.add("SEQ",           r"-=")
        self.lexer.add("MEQ",           r"\*=")
        self.lexer.add("DEQ",           r"/=")
        self.lexer.add("MODEQ",         r"%=")
        self.lexer.add("LSEQ",          r"<{2}=")
        self.lexer.add("RSEQ",          r">{2}=")
        self.lexer.add("LSH",           r"<{2}")
        self.lexer.add("RSH",           r">{2}")
        self.lexer.add("BOEQ",          r"\|=")
        self.lexer.add("BAEQ",          r"&=")
        self.lexer.add("XEQ",           r"\^=")
        self.lexer.add("LEQ",           r"<=")
        self.lexer.add("GEQ",           r">=")
        self.lexer.add("NEQ",           r"!=")
        self.lexer.add("LT",            r"<")
        self.lexer.add("GT",            r">")
        self.lexer.add("SET",           r"=")
        self.lexer.add("INC",           r"\+{2}")
        self.lexer.add("DEC",           r"-{2}")
        self.lexer.add("AND",           r"&{2}")
        self.lexer.add("OR",            r"\|{2}")
        self.lexer.add("MOD",           r"%")
        self.lexer.add("MUL",           r"\*")
        self.lexer.add("DIV",           r"/")
        self.lexer.add("ADD",           r"\+")
        self.lexer.add("SUB",           r"-")
        self.lexer.add("NOT",           r"!")
        self.lexer.add("BOR",           r"\|")
        self.lexer.add("BAND",          r"&")
        self.lexer.add("XOR",           r"\^")
        self.lexer.add("COMP",          r"~")
        self.lexer.add("SIZEOF",        r"\bsizeof\b")
        self.lexer.add("TYPEDEF",       r"\btypedef\b")
        self.lexer.add("FUNC_MODIF",    r"\binline\b")
        self.lexer.add("VAR_MODIF",     r"\b(register|volatile)\b")
        self.lexer.add("BOTH_MODIF",    r"\b(const|signed|static|unsigned|extern)\b")
        self.lexer.add("GOTO",          r"\bgoto\b")
        self.lexer.add("RETURN",        r"\breturn\b")
        self.lexer.add("BREAK",         r"\bbreak\b")
        self.lexer.add("CONTINUE",      r"\bcontinue\b")
        self.lexer.add("FOR_LOOP",      r"\bfor\b")
        self.lexer.add("WHILE_LOOP",    r"\bwhile\b")
        self.lexer.add("DO_LOOP",       r"\bdo\b")
        self.lexer.add("IF_BRANCH",     r"\bif\b")
        self.lexer.add("ELSE_BRANCH",   r"\belse\b")
        self.lexer.add("SWITCH_BRANCH", r"\bswitch\b")
        self.lexer.add("CASE",          r"\bcase\b")
        self.lexer.add("DEFAULT",       r"\bdefault\b")
        self.lexer.add("NULL",          r"\bNULL\b")
        self.lexer.add("TYPE",          r"\b(auto|long double|double|float|long long( int)?|long|int|short|char|void)\b")
        self.lexer.add("MEM_STRUCT",    r"\b(struct|union|enum)\b")
        # Keyword rules above must stay ahead of this generic identifier rule.
        self.lexer.add("SELF_DEFINED",  r"[a-zA-Z_]\w*")
        self.lexer.add("OPEN_PAREN",    r"\(")
        self.lexer.add("CLOSE_PAREN",   r"\)")
        self.lexer.add("OPEN_BRACE",    r"\{")
        self.lexer.add("CLOSE_BRACE",   r"\}")
        self.lexer.add("OPEN_BRACK",    r"\[")
        self.lexer.add("CLOSE_BRACK",   r"\]")
        self.lexer.add("QUESTION",      r"\?")
        self.lexer.add("SEMICOLON",     r";")
        self.lexer.add("COLON",         r":")
        self.lexer.add("COMMA",         r",")
        self.lexer.add("INVALID",       r".+?") # Just to catch stuff we havent thought about yet
        # \s+ already covers newlines and tabs, so one ignore rule is enough.
        self.lexer.ignore(r'\s+')

    def get_lexer(self):
        """
        Retrieves the lexer, with the tokens added to the inner lexer object.

        The rules are registered on the first call only; later calls build a
        lexer from the same rule set.

        Returns:
            The lexer, now built with the tokens added
        """
        if not self._tokens_added:
            self._add_tokens()
            self._tokens_added = True
        return self.lexer.build()

def tokensToString(tokens):
    """
    Renders every token with `str` and joins the results, one per line.

    Args:
        tokens: The token object that is returned from the lexer.

    Returns:
        A single string with the tokens separated by newline characters.
    """
    rendered = map(str, tokens)
    return "\n".join(rendered)

def validateTokens(tokens):
    """
    Validates the given token list.

    Every token named "INVALID" is reported through `print_error`. If any
    were found, a `LexingError` carrying the position of the first invalid
    token is raised.

    Args:
        tokens: The token object that is returned from the lexer.

    Returns:
        The same token object if there were no invalid tokens.

    Raises:
        LexingError: If there is at least one invalid token this is raised.
    """
    first_invalid = None

    # Iterate over a deep copy so the caller's token object is left untouched.
    # The lexer will mark any unknown tokens with the name, "INVALID".
    for token in copy.deepcopy(tokens):
        if token.name == "INVALID":
            print_error(token)
            if first_invalid is None:
                first_invalid = token

    if first_invalid is not None:
        # Report where the invalid input starts, not wherever the scan ended.
        raise LexingError("invalid token", first_invalid.source_pos)
    return tokens

def print_error(token):
    """
    Reports an invalid token on standard output.

    The printed message contains the token's `value` and `source_pos`
    attributes and is followed by an empty line.

    Args:
        token: The token object that is returned from the lexer.
    """
    message = f"LexingError: Invalid Token \'{token.value}\' at, {token.source_pos}\n"
    print(message)

Functions

def print_error(token)

Prints lexer error message. Currently we only experience invalid token errors. The input token is a Token object, imported from rply.

Args

token
The token object that is returned from the lexer.
Expand source code
def print_error(token):
    """
    Reports an invalid token on standard output.

    The printed message contains the token's `value` and `source_pos`
    attributes and is followed by an empty line.

    Args:
        token: The token object that is returned from the lexer.
    """
    message = f"LexingError: Invalid Token \'{token.value}\' at, {token.source_pos}\n"
    print(message)
def tokensToString(tokens)

Iterates through the tokens and generates a string of all of them

Args

tokens
The token object that is returned from the lexer.
Expand source code
def tokensToString(tokens):
    """
    Renders every token with `str` and joins the results, one per line.

    Args:
        tokens: The token object that is returned from the lexer.

    Returns:
        A single string with the tokens separated by newline characters.
    """
    rendered = map(str, tokens)
    return "\n".join(rendered)
def validateTokens(tokens)

Validates the given token list.

Args

tokens
The token object that is returned from the lexer.

Returns

The same token object if there were no invalid tokens.

Raises

LexingError
If there is at least one invalid token this is raised.
Expand source code
def validateTokens(tokens):
    """
    Validates the given token list.

    Every token named "INVALID" is reported through `print_error`. If any
    were found, a `LexingError` carrying the position of the first invalid
    token is raised.

    Args:
        tokens: The token object that is returned from the lexer.

    Returns:
        The same token object if there were no invalid tokens.

    Raises:
        LexingError: If there is at least one invalid token this is raised.
    """
    first_invalid = None

    # Iterate over a deep copy so the caller's token object is left untouched.
    # The lexer will mark any unknown tokens with the name, "INVALID".
    for token in copy.deepcopy(tokens):
        if token.name == "INVALID":
            print_error(token)
            if first_invalid is None:
                first_invalid = token

    if first_invalid is not None:
        # Report where the invalid input starts, not wherever the scan ended.
        raise LexingError("invalid token", first_invalid.source_pos)
    return tokens

Classes

class Lexer

The lexer class definition

Constructs the Lexer object.

Expand source code
class Lexer():
    """
    The lexer class definition.

    Wraps an rply `LexerGenerator`, registers the token rules on it and hands
    back the built lexer through `get_lexer`.
    """
    def __init__(self):
        """
        Constructs the Lexer object.
        """
        self.lexer = LexerGenerator()
        # Set once the rules are registered so repeated get_lexer() calls do
        # not append the whole rule set to the generator again.
        self._tokens_added = False

    def _add_tokens(self):
        """
        Adds tokens to the rply lexer object.

        When several rules could match at the same position, rply picks the
        rule that was added first, so the order of the calls below matters:
        longer or more specific patterns must be registered before the
        shorter ones they overlap with.
        """
        self.lexer.add("COMMENT",       r"/(\*(\w|\W)*?\*/|/([^\n]*))") # Catches both multi-line and single line comments
        self.lexer.add("PREPROCESSOR",  r"#\s*(warning|else|endif|include|undef|ifdef|ifndef|if|elif|pragma|define|error|line)([\t\f ]+[^\s]+)*")
        self.lexer.add("CHAR",          r"\'\\?[\w\;\\ \%\"\']\'")
        self.lexer.add("STRING",        r"(\"[^\n]*?(?<!\\)\")|(\'[^\n]*?(?<!\\)\')") # Classifies single characters and multiple characters as a string
        self.lexer.add("HEX",           r"0x[\dA-Fa-f]+")
        self.lexer.add("OCT",           r"0[0-7]{1,3}")
        self.lexer.add("BIN",           r"0b[01]+")
        # A digit is required on at least one side of the point; a lone "."
        # must fall through to ACCESS instead of being taken as a number.
        self.lexer.add("PRECISION",     r"(\d|[1-9]\d+)\.\d*|\.\d+")
        self.lexer.add("INTEGER",       r"([1-9]\d*|\d)")
        # Registered ahead of the "-" operators: if SUB came first, "->" would
        # be split into SUB followed by GT and ACCESS could never match it.
        self.lexer.add("ACCESS",        r"->|\.")
        self.lexer.add("EQ",            r"={2}")
        self.lexer.add("AEQ",           r"\+=")
        self.lexer.add("SEQ",           r"-=")
        self.lexer.add("MEQ",           r"\*=")
        self.lexer.add("DEQ",           r"/=")
        self.lexer.add("MODEQ",         r"%=")
        self.lexer.add("LSEQ",          r"<{2}=")
        self.lexer.add("RSEQ",          r">{2}=")
        self.lexer.add("LSH",           r"<{2}")
        self.lexer.add("RSH",           r">{2}")
        self.lexer.add("BOEQ",          r"\|=")
        self.lexer.add("BAEQ",          r"&=")
        self.lexer.add("XEQ",           r"\^=")
        self.lexer.add("LEQ",           r"<=")
        self.lexer.add("GEQ",           r">=")
        self.lexer.add("NEQ",           r"!=")
        self.lexer.add("LT",            r"<")
        self.lexer.add("GT",            r">")
        self.lexer.add("SET",           r"=")
        self.lexer.add("INC",           r"\+{2}")
        self.lexer.add("DEC",           r"-{2}")
        self.lexer.add("AND",           r"&{2}")
        self.lexer.add("OR",            r"\|{2}")
        self.lexer.add("MOD",           r"%")
        self.lexer.add("MUL",           r"\*")
        self.lexer.add("DIV",           r"/")
        self.lexer.add("ADD",           r"\+")
        self.lexer.add("SUB",           r"-")
        self.lexer.add("NOT",           r"!")
        self.lexer.add("BOR",           r"\|")
        self.lexer.add("BAND",          r"&")
        self.lexer.add("XOR",           r"\^")
        self.lexer.add("COMP",          r"~")
        self.lexer.add("SIZEOF",        r"\bsizeof\b")
        self.lexer.add("TYPEDEF",       r"\btypedef\b")
        self.lexer.add("FUNC_MODIF",    r"\binline\b")
        self.lexer.add("VAR_MODIF",     r"\b(register|volatile)\b")
        self.lexer.add("BOTH_MODIF",    r"\b(const|signed|static|unsigned|extern)\b")
        self.lexer.add("GOTO",          r"\bgoto\b")
        self.lexer.add("RETURN",        r"\breturn\b")
        self.lexer.add("BREAK",         r"\bbreak\b")
        self.lexer.add("CONTINUE",      r"\bcontinue\b")
        self.lexer.add("FOR_LOOP",      r"\bfor\b")
        self.lexer.add("WHILE_LOOP",    r"\bwhile\b")
        self.lexer.add("DO_LOOP",       r"\bdo\b")
        self.lexer.add("IF_BRANCH",     r"\bif\b")
        self.lexer.add("ELSE_BRANCH",   r"\belse\b")
        self.lexer.add("SWITCH_BRANCH", r"\bswitch\b")
        self.lexer.add("CASE",          r"\bcase\b")
        self.lexer.add("DEFAULT",       r"\bdefault\b")
        self.lexer.add("NULL",          r"\bNULL\b")
        self.lexer.add("TYPE",          r"\b(auto|long double|double|float|long long( int)?|long|int|short|char|void)\b")
        self.lexer.add("MEM_STRUCT",    r"\b(struct|union|enum)\b")
        # Keyword rules above must stay ahead of this generic identifier rule.
        self.lexer.add("SELF_DEFINED",  r"[a-zA-Z_]\w*")
        self.lexer.add("OPEN_PAREN",    r"\(")
        self.lexer.add("CLOSE_PAREN",   r"\)")
        self.lexer.add("OPEN_BRACE",    r"\{")
        self.lexer.add("CLOSE_BRACE",   r"\}")
        self.lexer.add("OPEN_BRACK",    r"\[")
        self.lexer.add("CLOSE_BRACK",   r"\]")
        self.lexer.add("QUESTION",      r"\?")
        self.lexer.add("SEMICOLON",     r";")
        self.lexer.add("COLON",         r":")
        self.lexer.add("COMMA",         r",")
        self.lexer.add("INVALID",       r".+?") # Just to catch stuff we havent thought about yet
        # \s+ already covers newlines and tabs, so one ignore rule is enough.
        self.lexer.ignore(r'\s+')

    def get_lexer(self):
        """
        Retrieves the lexer, with the tokens added to the inner lexer object.

        The rules are registered on the first call only; later calls build a
        lexer from the same rule set.

        Returns:
            The lexer, now built with the tokens added
        """
        if not self._tokens_added:
            self._add_tokens()
            self._tokens_added = True
        return self.lexer.build()

Methods

def get_lexer(self)

Retrieves the lexer, with the tokens added to the inner lexer object.

Returns

The lexer, now built with the tokens added
 
Expand source code
def get_lexer(self):
    """
    Registers the token rules on the inner lexer object and builds it.

    Returns:
        The lexer, now built with the tokens added
    """
    self._add_tokens()
    built = self.lexer.build()
    return built