Source code for mwptoolkit.utils.preprocess_tool.equation_operator

import random
from copy import deepcopy
from typing import Tuple, List, Union

from mwptoolkit.utils.enum_type import SpecialTokens,NumMask,EPT
from mwptoolkit.utils.preprocess_tool.number_operator import constant_number

[docs]def from_infix_to_postfix(expression):
    r"""convert infix equation to postfix equation.

    Args:
        expression (list): infix expression.
    
    Returns:
        (list): postfix expression.
    """
    st = list()
    res = list()
    priority = {"<BRG>": 0, "=": 1, "+": 2, "-": 2, "*": 3, "/": 3, "^": 4}
    for e in expression:
        if e in ["(", "["]:
            st.append(e)
        elif e == ")":
            c = st.pop()
            while c != "(":
                res.append(c)
                c = st.pop()
        elif e == "]":
            c = st.pop()
            while c != "[":
                res.append(c)
                c = st.pop()
        elif e in priority:
            while len(st) > 0 and st[-1] not in ["(", "["] and priority[e] <= priority[st[-1]]:
                res.append(st.pop())
            st.append(e)
        else:
            res.append(e)
    while len(st) > 0:
        res.append(st.pop())
    return res


[docs]def from_infix_to_prefix(expression):
    r"""convert infix equation to prefix equation

    Args:
        expression (list): infix expression.
    
    Returns:
        (list): prefix expression.
    """
    st = list()
    res = list()
    priority = {"<BRG>": 0, "=": 1, "+": 2, "-": 2, "*": 3, "/": 3, "^": 4}
    expression = deepcopy(expression)
    expression.reverse()
    for e in expression:
        if e in [")", "]"]:
            st.append(e)
        elif e == "(":
            c = st.pop()
            while c != ")":
                res.append(c)
                c = st.pop()
        elif e == "[":
            c = st.pop()
            while c != "]":
                res.append(c)
                c = st.pop()
        elif e in priority:
            while len(st) > 0 and st[-1] not in [")", "]"] and priority[e] < priority[st[-1]]:
                res.append(st.pop())
            st.append(e)
        else:
            res.append(e)
    while len(st) > 0:
        res.append(st.pop())
    res.reverse()
    return res


[docs]def from_prefix_to_postfix(expression):
    r"""convert prefix equation to postfix equation

    Args:
        expression (list): prefix expression.
    
    Returns:
        (list): postfix expression.
    """
    st = list()
    expression = deepcopy(expression)
    expression.reverse()
    for symbol in expression:
        if symbol not in ['+', '-', '*', '/', '^', "=", "<BRG>"]:
            st.append([symbol])
        else:
            n1 = st.pop()
            n2 = st.pop()
            st.append(n1 + n2 + [symbol])
    res = st.pop()
    return res


[docs]def from_postfix_to_prefix(expression):
    r"""convert postfix equation to prefix equation

    Args:
        expression (list): postfix expression.
    
    Returns:
        (list): prefix expression.
    """
    st = list()
    for symbol in expression:
        if symbol not in ['+', '-', '*', '/', '^', "=", "<BRG>"]:
            st.append([symbol])
        else:
            n1 = st.pop()  #2
            n2 = st.pop()  #3
            st.append([symbol] + n2 + n1)
    res = st.pop()
    return res


[docs]def from_prefix_to_infix(expression):
    r"""convert prefix equation to infix equation

    Args:
        expression (list): prefix expression.
    
    Returns:
        (list): infix expression.
    """
    st = list()
    last_op = []
    priority = {"<BRG>": 0, "=": 1, "+": 2, "-": 2, "*": 3, "/": 3, "^": 4}
    expression = deepcopy(expression)
    expression.reverse()
    for symbol in expression:
        if symbol not in ['+', '-', '*', '/', '^', "=", "<BRG>"]:
            st.append([symbol])
        else:
            n_left = st.pop()
            n_right = st.pop()
            left_first = False
            right_first = False
            if len(n_left) > 1 and priority[last_op.pop()] < priority[symbol]:
                left_first = True
            if len(n_right) > 1 and priority[last_op.pop()] <= priority[symbol]:
                right_first = True
            if left_first:
                n_left = ['('] + n_left + [')']
            if right_first:
                n_right = ['('] + n_right + [')']
            st.append(n_left + [symbol] + n_right)
            last_op.append(symbol)
    res = st.pop()
    return res


[docs]def from_postfix_to_infix(expression):
    r"""convert postfix equation to infix equation

    Args:
        expression (list): postfix expression.
    
    Returns:
        (list): infix expression.
    """
    st = list()
    last_op = []
    priority = {"<BRG>": 0, "=": 1, "+": 2, "-": 2, "*": 3, "/": 3, "^": 4}
    for symbol in expression:
        if symbol not in ['+', '-', '*', '/', '^', "=", "<BRG>"]:
            st.append([symbol])
        else:
            n_right = st.pop()
            n_left = st.pop()
            left_first = False
            right_first = False
            if len(n_right) > 1 and priority[last_op.pop()] <= priority[symbol]:
                right_first = True
            if len(n_left) > 1 and priority[last_op.pop()] < priority[symbol]:
                left_first = True
            if left_first:
                n_left = ['('] + n_left + [')']
            if right_first:
                n_right = ['('] + n_right + [')']
            st.append(n_left + [symbol] + n_right)
            last_op.append(symbol)
    res = st.pop()
    return res


[docs]def from_infix_to_multi_way_tree(expression):
    res = []
    st = []
    level = 0
    for e in expression:
        if e in ['(', '[']:
            level += 1
            st.append(e)
        elif e in [')', ']']:
            level -= 1
            st.append(e)
            if level == 0:
                sub_res = from_infix_to_multi_way_tree(st[1:-1])
                res.append(sub_res)
                st = []
        else:
            if level != 0:
                st.append(e)
            else:
                res.append(e)
    return res


[docs]def postfix_parser(equation, memory: list) -> int:
    """
    Read Op-token postfix equation and transform it into Expression-token sequence.

    :param List[Union[str,Tuple[str,Any]]] equation:
        List of op-tokens to be parsed into a Expression-token sequence
        Item of this list should be either
        - an operator string
        - a tuple of (operand source, operand value)
    :param list memory:
        List where previous execution results of expressions are stored
    :rtype: int
    :return:
        Size of stack after processing. Value 1 means parsing was done without any free expression.
    """
    stack = []

    for tok in equation:
        if tok in EPT.OPERATORS:
            # If token is an operator, form expression and push it into the memory and stack.
            op = EPT.OPERATORS[tok]
            arity = op['arity']

            # Retrieve arguments
            args = stack[-arity:]
            stack = stack[:-arity]

            # Store the result with index where the expression stored
            stack.append((EPT.ARG_MEM, len(memory)))
            # Store the expression into the memory.
            memory.append((tok, args))
        else:
            # Push an operand before meet an operator
            stack.append(tok)

    return len(stack)


[docs]def orig_infix_to_postfix(equation: Union[str, List[str]], number_token_map: dict, free_symbols: list,
                     join_output: bool = True):
    """
    Read infix equation string and convert it into a postfix string

    :param Union[str,List[str]] equation:
        Either one of these.
        - A single string of infix equation. e.g. "5 + 4"
        - Tokenized sequence of infix equation. e.g. ["5", "+", "4"]
    :param dict number_token_map:
        Mapping from a number token to its anonymized representation (e.g. N_0)
    :param list free_symbols:
        List of free symbols (for return)
    :param bool join_output:
        True if the output need to be joined. Otherwise, this method will return the tokenized postfix sequence.
    :rtype: Union[str, List[str]]
    :return:
        Either one of these.
        - A single string of postfix equation. e.g. "5 4 +"
        - Tokenized sequence of postfix equation. e.g. ["5", "4", "+"]
    """
    # Template in ALG514/DRAW is already tokenized, without parenthesis.
    # Template in MAWPS is also tokenized but contains parenthesis.
    output_tokens = []
    postfix_stack = []

    # Tokenize the string if that is not tokenized yet.
    if type(equation) is str:
        equation = equation.strip().split(' ')

    # Read each token
    for tok in equation:
        # Ignore blank token
        if not tok:
            continue

        if tok == ')':
            # Pop until find the opening paren '('
            while postfix_stack:
                top = postfix_stack.pop()
                if top == '(':
                    # Discard the matching '('
                    break
                else:
                    output_tokens.append(top)
        elif tok in '*/+-=(':
            # '(' has the highest precedence when in the input string.
            precedence = EPT.OPERATOR_PRECEDENCE.get(tok, 1E9)

            while postfix_stack:
                # Pop until the top < current_precedence.
                # '(' has the lowest precedence in the stack.
                top = postfix_stack[-1]
                if EPT.OPERATOR_PRECEDENCE.get(top, -1E9) < precedence:
                    break
                else:
                    output_tokens.append(postfix_stack.pop())
            postfix_stack.append(tok)
        elif EPT.NUMBER_PATTERN.fullmatch(tok) is not None:
            # Just output the operand.
            positive, const_code = constant_number(tok)
            if not positive:
                const_code = const_code + '_NEG'
            output_tokens.append(const_code)
        elif tok in number_token_map:
            # Just output the operand
            output_tokens += number_token_map[tok]
        else:
            # This is a variable name
            # Just output the operand.
            if tok not in free_symbols:
                free_symbols.append(tok)

            tok = 'X_%s' % free_symbols.index(tok)
            output_tokens.append(tok)

    while postfix_stack:
        output_tokens.append(postfix_stack.pop())

    if join_output:
        return ' '.join(output_tokens)
    else:
        return output_tokens


[docs]def infix_to_postfix(equation, free_symbols: list,
                     join_output: bool = True):

    output_tokens = []
    postfix_stack = []

    # Tokenize the string if that is not tokenized yet.
    if type(equation) is str:
        equation = equation.strip().split(' ')

    # Read each token
    for tok in equation:
        # Ignore blank token
        if not tok:
            continue

        if tok == ')':
            # Pop until find the opening paren '('
            while postfix_stack:
                top = postfix_stack.pop()
                if top == '(':
                    # Discard the matching '('
                    break
                else:
                    output_tokens.append(top)
        elif tok in '*/+-=^(':
            # '(' has the highest precedence when in the input string.
            precedence = EPT.OPERATOR_PRECEDENCE.get(tok, 1E9)

            while postfix_stack:
                # Pop until the top < current_precedence.
                # '(' has the lowest precedence in the stack.
                top = postfix_stack[-1]
                if EPT.OPERATOR_PRECEDENCE.get(top, -1E9) < precedence:
                    break
                else:
                    output_tokens.append(postfix_stack.pop())
            postfix_stack.append(tok)
        elif EPT.NUMBER_PATTERN.fullmatch(tok) is not None:
            # Just output the operand.
            positive, const_code = constant_number(tok)
            if not positive:
                const_code = const_code + '_NEG'
            output_tokens.append(const_code)
        elif 'NUM_' in tok:
            output_tokens.append('N_'+tok[4:])

        else:
            # This is a variable name
            # Just output the operand.
            if tok not in free_symbols:
                free_symbols.append(tok)

            tok = 'X_%s' % free_symbols.index(tok)
            output_tokens.append(tok)

    while postfix_stack:
        output_tokens.append(postfix_stack.pop())

    if join_output:
        return ' '.join(output_tokens)
    else:
        return output_tokens



[docs]def operator_mask(expression):
    template = []
    for symbol in expression:
        if isinstance(symbol, list):
            sub_temp = operator_mask(symbol)
            template.append(sub_temp)
        elif symbol in ["+", "-", "*", "/", "^", "=", "<BRG>"]:
            template.append(SpecialTokens.OPT_TOKEN)
        else:
            template.append(symbol)
    return template


[docs]def trans_symbol_2_number(equ_list, num_list):
    """transfer mask symbol in equation to number.

    Args:
        equ_list (list): equation.
        num_list (list): number list.
    
    Return:
        (list): equation.
    """
    symbol_list = NumMask.number
    new_equ_list = []
    for symbol in equ_list:
        if 'NUM' in symbol:
            index = symbol_list.index(symbol)
            new_equ_list.append(str(num_list[index]))
        else:
            new_equ_list.append(symbol)
    return new_equ_list


[docs]def EN_rule1_stat(datas, sample_k=100):
    """equation norm rule1

    Args:
        datas (list): dataset.
        sample_k (int): number of random sample.
        
    Returns:
        (list): classified equations. equivalent equations will be in the same class.
    """
    rule_1 = []
    for data in datas:
        temp_data = data
        equ_list = data["equation"]
        rule_1.append(equ_list)
    samples = random.sample(range(10, 100), k=sample_k)
    random.shuffle(samples)
    ans_dict = {}
    for equ_list in rule_1:
        new_equ = trans_symbol_2_number(equ_list, samples)
        new_equ = ''.join(new_equ)
        new_equ = new_equ.replace("^", "**", 10)
        new_equ = new_equ.replace("[", "(", 10)
        new_equ = new_equ.replace("]", ")", 10)
        try:
            ans = eval(new_equ)
        except:
            ans = float("inf")
        try:
            ans_dict[ans].append(equ_list)
        except:
            ans_dict[ans] = []
            ans_dict[ans].append(equ_list)
    class_list = []
    for k, v in ans_dict.items():
        class_list.append(v)
    for i in range(50):
        samples = random.sample(range(10, 100), k=sample_k)
        random.shuffle(samples)
        class_copy = deepcopy(class_list)
        class_list = []
        for equ_lists in class_copy:
            ans_dict = {}
            for equ_list in equ_lists:
                new_equ = trans_symbol_2_number(equ_list, samples)
                new_equ = ''.join(new_equ)
                new_equ = new_equ.replace("^", "**", 10)
                new_equ = new_equ.replace("[", "(", 10)
                new_equ = new_equ.replace("]", ")", 10)
                try:
                    ans = eval(new_equ)
                except:
                    ans = float("inf")
                try:
                    ans_dict[ans].append(equ_list)
                except:
                    ans_dict[ans] = []
                    ans_dict[ans].append(equ_list)
            for k, v in ans_dict.items():
                class_list.append(v)
    class_copy = deepcopy(class_list)
    class_list = []
    for equ_lists in class_copy:
        class_list_temp = []
        for equ_list in equ_lists:
            if equ_list not in class_list_temp:
                class_list_temp.append(equ_list)
            class_list_temp = sorted(class_list_temp, key=lambda x: len(x), reverse=False)
        class_list.append(class_list_temp)
    return class_list


[docs]def EN_rule2(equ_list):
    """equation norm rule2

    Args:
        equ_list (list): equation.
    
    Returns:
        list: equivalent equation.
    """
    new_list = []
    i = 0
    while i < len(equ_list):
        if (i + 4) < len(equ_list) and ('NUM' in equ_list[i] or equ_list[i].isalpha()) and '+' in equ_list[i + 1] and (
                'NUM' in equ_list[i + 2] or equ_list[i + 2].isalpha()) and '+' in equ_list[i + 3] and ('NUM' in equ_list[i + 4] or equ_list[i + 4].isalpha()):
            if i - 1 >= 0 and equ_list[i - 1] in ['/', '-', '*']:
                new_list.append(equ_list[i])
                i += 1
                continue
            if i + 5 < len(equ_list) and equ_list[i + 5] in ['/', '-', '*']:
                new_list.append(equ_list[i])
                i += 1
                continue
            temp = [equ_list[i], equ_list[i + 2], equ_list[i + 4]]
            sort_temp = sorted(temp)
            new_temp = sort_temp[0:1] + ['+'] + sort_temp[1:2] + ['+'] + sort_temp[2:3]
            new_list += new_temp
            i += 5
        elif (i + 4) < len(equ_list) and ('NUM' in equ_list[i] or equ_list[i].isalpha()) and '*' in equ_list[i + 1] and (
                'NUM' in equ_list[i + 2] or equ_list[i + 2].isalpha()) and '*' in equ_list[i + 3] and ('NUM' in equ_list[i + 4] or equ_list[i + 4].isalpha()):
            if i - 1 >= 0 and equ_list[i - 1] in ['/', '-']:
                new_list.append(equ_list[i])
                i += 1
                continue
            if i + 5 < len(equ_list) and equ_list[i + 5] in ['/', '-']:
                new_list.append(equ_list[i])
                i += 1
                continue
            temp = [equ_list[i], equ_list[i + 2], equ_list[i + 4]]
            sort_temp = sorted(temp)
            new_temp = sort_temp[0:1] + ['*'] + sort_temp[1:2] + ['*'] + sort_temp[2:3]
            new_list += new_temp
            i += 5
        elif (i + 2) < len(equ_list) and ('NUM' in equ_list[i] or equ_list[i].isalpha()) and '+' in equ_list[i + 1] and ('NUM' in equ_list[i + 2] or equ_list[i + 2].isalpha()):
            if i - 1 >= 0 and equ_list[i - 1] in ['/', '-', '*']:
                new_list.append(equ_list[i])
                i += 1
                continue
            if i + 3 < len(equ_list) and equ_list[i + 3] in ['/', '-', '*']:
                new_list.append(equ_list[i])
                i += 1
                continue
            temp = [equ_list[i], equ_list[i + 2]]
            sort_temp = sorted(temp)
            new_temp = sort_temp[0:1] + ['+'] + sort_temp[1:2]
            new_list += new_temp
            i += 3
        elif (i + 2) < len(equ_list) and ('NUM' in equ_list[i] or equ_list[i].isalpha()) and '*' in equ_list[i + 1] and ('NUM' in equ_list[i + 2] or equ_list[i + 2].isalpha()):
            if i - 1 >= 0 and equ_list[i - 1] in ['/', '-']:
                new_list.append(equ_list[i])
                i += 1
                continue
            if i + 3 < len(equ_list) and equ_list[i + 3] in ['/', '-']:
                new_list.append(equ_list[i])
                i += 1
                continue
            temp = [equ_list[i], equ_list[i + 2]]
            sort_temp = sorted(temp)
            new_temp = sort_temp[0:1] + ['*'] + sort_temp[1:2]
            new_list += new_temp
            i += 3
        else:
            new_list.append(equ_list[i])
            i += 1
    return new_list