import itertools
import nltk
import re
import gooddata_sdk as gd
import sys
import create_insight
from constants import *
from fuzzywuzzy import fuzz
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction import _stop_words


# Need to download these once before running the script. You do not need to download them again afterwards.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Helper function to read datetime_keywords.txt
def get_datetime_keywords(filename):
    """Build a keyword -> alternatives mapping from a comma-separated file.

    Each non-blank line of ``filename`` is split on commas. Every element of
    the line becomes a key whose value is the list of the other elements on
    that same line (all entries equal to the key are left out). When an
    element appears on several lines, the last line wins.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping each element to the list of its same-line alternatives.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    output = {}
    # The context manager closes the file even when reading fails part-way
    with open(filename, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise register a meaningless '' keyword
                continue
            elems = stripped.split(',')
            for elem in elems:
                output[elem] = [v for v in elems if v != elem]
    return output


# Helper function to help lemmatizer
def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag into a WordNet POS letter.

    Only the first two characters of the tag are considered, so e.g. 'NNS'
    and 'NN' both map to 'n'.

    Args:
        penntag: Penn Treebank tag such as 'NN', 'VBD' or 'JJR'.

    Returns:
        'n', 'a', 'v' or 'r'. Unknown tags, and values that cannot be sliced
        or hashed (e.g. ``None``), fall back to 'n'.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag.get(penntag[:2], 'n')
    except TypeError:
        # Non-sliceable / unhashable input: keep the historical noun fallback
        # without swallowing unrelated exceptions the way a bare except did
        return 'n'


# Convert user's input and lemmatize the phrases to find similar phrase variations and synonyms
def get_input_synonyms(input):
    """Expand a sentence into the phrase variations obtained by swapping in synonyms.

    The sentence is tokenized, POS-tagged and lemmatized. For every distinct
    lemma the WordNet lemma names of its synsets are collected, skipping any
    name that contains the lemma itself. English stop words are never
    substituted. The result is the cartesian product of the per-token
    alternatives, each combination joined with single spaces.

    Because alternatives are stored in a dict keyed by lemma, a lemma that
    occurs several times in the sentence contributes only once to each
    variation.

    Args:
        input: The user's sentence.

    Returns:
        list of str, one entry per combination of alternatives. An input with
        no tokens yields ``['']``.
    """
    lemmatizer = WordNetLemmatizer()
    # POS tags are needed for the lemmatizer to pick the right base form
    word_tagged = pos_tag(word_tokenize(input))
    # Lemmatize each tokenized word from the user's command
    tokens = [lemmatizer.lemmatize(word, pos=penn2morphy(tag)) for word, tag in word_tagged]

    # Find synonyms for each token
    token2synonyms = {}
    for token in tokens:
        syn_list = [token]
        for syn_obj in wordnet.synsets(token):
            for phrase in syn_obj.lemma_names():
                # Literal substring test. The token used to be passed to
                # re.search as a pattern, which raised re.error on tokens such
                # as "?" or "(" and mis-matched on tokens containing "."
                if phrase not in syn_list and token not in phrase:
                    syn_list.append(phrase)
        token2synonyms[token] = syn_list

    # Apparently, 'sale' has no return, 'sales' we got 'gross_revenue'
    # Hardcode to put 'sales' and 'revenue' as 'sale' synonyms
    if 'sale' in token2synonyms:
        token2synonyms['sale'].append('sales')
        token2synonyms['sale'].append('revenue')
    if 'revenue' in token2synonyms:
        token2synonyms['revenue'].append('sales')

    # Unpack synonyms and create the possible input variations;
    # stop words keep only their own form
    synonyms_values = []
    for k in token2synonyms:
        if k in _stop_words.ENGLISH_STOP_WORDS:
            synonyms_values.append([k])
        else:
            synonyms_values.append(token2synonyms[k])
    return [' '.join(v) for v in itertools.product(*synonyms_values)]


# Function to chunk sentences based on words that split the sentence into semantic chunks
def chunk_sentence(words, splitter_words):
    """Split ``words`` around the last element that is found in ``splitter_words``.

    Returns a three-item list ``[before, after, splitter]``. When no element
    matches, the result is ``[words, '', '']`` - note the empty *strings*
    used as "not found" markers.
    """
    # Walk backwards so the right-most splitter wins
    for position in range(len(words) - 1, -1, -1):
        candidate = words[position]
        if candidate in splitter_words:
            return [words[:position], words[position + 1:], candidate]
    return [words, '', '']


# Function to compare words using fuzzy matching
def compare_words(dataset_title, ldm_item, input_word, type):
    """Score how well a catalog item's title matches the user's words.

    Two candidates are scored with ``fuzz.ratio`` and the higher score is
    kept: the lower-cased item title on its own, and the lower-cased dataset
    title followed by a space and the item title.

    Args:
        dataset_title: Title of the dataset owning the item ('' when there is none).
        ldm_item: Object exposing ``title`` and ``id`` attributes.
        input_word: The user's words, either as a string or as a list/tuple of
            tokens. Token sequences are joined with single spaces before
            scoring, so they are compared as a phrase rather than as the
            textual representation of a Python list.
        type: Label copied into the result (the name shadows the builtin but
            is kept so existing keyword callers keep working).

    Returns:
        dict with keys "id", "title", "ratio" and "type".
    """
    if isinstance(input_word, (list, tuple)):
        input_word = ' '.join(str(token) for token in input_word)
    item_title = ldm_item.title.lower()
    qualified_title = dataset_title.lower() + " " + item_title
    ratio = max(fuzz.ratio(qualified_title, input_word), fuzz.ratio(item_title, input_word))
    return {"id": ldm_item.id, "title": ldm_item.title, "ratio": ratio, "type": type}


# Helper to drop leading request phrases (e.g. "show me") from a list of tokens
def _strip_starting_phrases(tokens):
    """Return ``tokens`` without the leading two-word phrases listed in ``starting_words``.

    Lists of two or fewer tokens are returned unchanged. Phrases are tried in
    the order of ``starting_words``; each one that matches the current first
    two tokens is removed.
    """
    if len(tokens) > 2:
        for phrase in starting_words:
            # Re-check the length: an earlier removal may have left fewer than two tokens
            if len(tokens) >= 2 and phrase[0] == tokens[0] and phrase[1] == tokens[1]:
                tokens = tokens[2:]
    return tokens


# Helper to separate a leading aggregation keyword (e.g. "average") from the remaining tokens
def _split_aggregation(tokens):
    """Return ``(aggregation, remaining_tokens)`` for a measure phrase.

    When the first token is listed in ``aggregation_words`` the matching
    aggregation name is returned together with the tokens after it.
    Otherwise the aggregation defaults to 'Sum' and all tokens are kept.
    """
    if tokens:
        for aggregation, keywords in aggregation_words.items():
            if tokens[0] in keywords:
                return aggregation, tokens[1:]
    return 'Sum', tokens  # Use sum as default if not supplied


# Function to create the insight definition
def create_insight_definition(input, catalog):
    """Parse a natural-language request into an insight definition dict.

    The lower-cased sentence is tokenized and then processed in this order:

    1. leading request phrases (``starting_words``) are removed;
    2. a chart word (``chart_words``) and the word right before it are
       removed, the preceding word becoming the visualization type;
    3. ``ignore_words`` are removed;
    4. the part after the last ``filtering_words`` entry is parsed as
       ``<attribute> <equating word> <value> [or/and <value> ...]``;
    5. the part after the last ``slicing_words`` entry gives up to two slice
       attributes separated by "and";
    6. what remains gives up to two measures separated by "and", each
       optionally starting with an aggregation keyword.

    Measures are matched against every metric and every fact of the catalog,
    attributes against every dataset attribute, using ``compare_words``; the
    highest ratio wins (first candidate on ties).

    Args:
        input: The user's sentence.
        catalog: Object exposing ``metrics`` and ``datasets``; each dataset
            exposes ``title``, ``facts`` and ``attributes``.

    Returns:
        dict with the keys
        "viz_type" (str; "table" by default, "headline" when nothing is sliced),
        "measures_meta" (list of dicts with measure_title, measure_id,
        measure_type and measure_aggregation; the aggregation is '' for metrics),
        "filter_meta" (dict holding a "positiveAttributeFilter", or None) and
        "attribute_meta" (list of dicts with attribute_id and attribute_type,
        or None).

    Raises:
        ValueError: If the catalog has neither metrics nor facts to match.
    """
    input = input.lower()
    insight_definition = {"viz_type": "table"}  # use table as default if not supplied
    # Nothing in this function parses ranking requests yet, so these stay empty
    ranking_type = ''
    ranking_num = ''

    # Tokenize input and drop phrases such as "show me"
    words = _strip_starting_phrases(nltk.word_tokenize(input))

    # "<viz> chart": remove the chart word and use the word before it as the viz type
    for word in list(words):
        if word not in chart_words or word not in words:
            continue
        position = words.index(word)
        del words[position]
        # Without a preceding word the default viz type is kept (previously
        # the last word of the sentence was picked up by a negative index)
        if position > 0:
            insight_definition["viz_type"] = words.pop(position - 1)
    words = [word for word in words if word not in ignore_words]

    # Filter clause: "<attribute> is <value> or <value> ..."
    filter_attributes_1 = []
    filter_attribute_values_list_1 = []
    chunks = chunk_sentence(words, filtering_words)
    if chunks[1] != '':
        filter_attributes_1 = chunk_sentence(chunks[1], equating_words)
        pending_values = filter_attributes_1[1]
        # Peel values off from the right, one conjunction at a time
        while conjunction_words[0] in pending_values or conjunction_words[1] in pending_values:
            pending_values, value_tokens, _ = chunk_sentence(pending_values, conjunction_words)
            filter_attribute_values_list_1.append(value_tokens)
        filter_attribute_values_list_1.append(pending_values)

    # Slice clause and measures; only "and" separates multiple items here.
    # The splitter is passed as a list so whole words are compared - a bare
    # string would also match any token that is a substring of "and".
    and_splitter = [conjunction_words[1]]
    chunks = chunk_sentence(chunks[0], slicing_words)
    slice_attributes = []
    if chunks[1] != '':
        slice_attributes = chunk_sentence(chunks[1], and_splitter)

    metrics_measures = chunk_sentence(chunks[0], and_splitter)
    metrics_measures[0] = _strip_starting_phrases(metrics_measures[0])
    first_measure = metrics_measures[0]
    second_measure = metrics_measures[1]
    has_second_measure = second_measure != ''

    # Find best matching facts and/or metrics
    # We had originally planned to also look for synonyms but it took too long to run the script with that.
    # The aggregation is resolved once up front so that every fact is scored;
    # previously only the first fact was scored when no aggregation keyword was given.
    aggregation_type_1, fact_tokens_1 = _split_aggregation(first_measure)
    aggregation_type_2, fact_tokens_2 = '', second_measure
    if has_second_measure:
        aggregation_type_2, fact_tokens_2 = _split_aggregation(second_measure)

    metrics_list_1 = []
    metrics_list_2 = []
    for metric in catalog.metrics:
        metrics_list_1.append(compare_words('', metric, first_measure, 'metric'))
        if has_second_measure:
            metrics_list_2.append(compare_words('', metric, second_measure, 'metric'))

    for dataset in catalog.datasets:
        for fact in dataset.facts:
            metrics_list_1.append(compare_words(dataset.title, fact, fact_tokens_1, 'measure'))
            if has_second_measure:
                metrics_list_2.append(compare_words(dataset.title, fact, fact_tokens_2, 'measure'))

    if not metrics_list_1:
        raise ValueError("Catalog contains no metrics or facts to match the request against")

    best_metric_1 = max(metrics_list_1, key=lambda candidate: candidate['ratio'])
    if best_metric_1["type"] == 'metric':
        aggregation_type_1 = ''  # metrics carry their own aggregation
    insight_definition["measures_meta"] = [{"measure_title": best_metric_1["title"], "measure_id": best_metric_1["id"], "measure_type": best_metric_1["type"], "measure_aggregation": aggregation_type_1}]
    if metrics_list_2:
        best_metric_2 = max(metrics_list_2, key=lambda candidate: candidate['ratio'])
        if best_metric_2["type"] == 'metric':
            aggregation_type_2 = ''
        insight_definition["measures_meta"].append({"measure_title": best_metric_2["title"], "measure_id": best_metric_2["id"], "measure_type": best_metric_2["type"], "measure_aggregation": aggregation_type_2})

    # For filtering, we need an exact match, including capitalization. For now, assuming title case.
    # NOTE(review): every token becomes its own filter value, so a multi-word
    # value such as "new york" yields two values - confirm this is intended.
    best_filter_attribute_value_1 = [word.title() for item in filter_attribute_values_list_1 for word in item]

    # Find best match for attributes
    filter_attributes_list_1 = []
    slice_attributes_list_1 = []
    slice_attributes_list_2 = []
    if filter_attributes_1 != [] or slice_attributes != []:
        for dataset in catalog.datasets:
            for attribute in dataset.attributes:
                if filter_attributes_1 != []:
                    filter_attributes_list_1.append(compare_words(dataset.title, attribute, filter_attributes_1[0], 'attribute'))
                if slice_attributes != []:
                    slice_attributes_list_1.append(compare_words(dataset.title, attribute, slice_attributes[0], 'attribute'))
                    if slice_attributes[1] != '':
                        slice_attributes_list_2.append(compare_words(dataset.title, attribute, slice_attributes[1], 'attribute'))

    if filter_attributes_list_1:
        best_filter_attribute_1 = max(filter_attributes_list_1, key=lambda candidate: candidate['ratio'])
        insight_definition["filter_meta"] = {"positiveAttributeFilter": {"filter_id": best_filter_attribute_1["id"], "filter_type": "label", "filter_values": best_filter_attribute_value_1}}
    elif ranking_type != '':
        insight_definition["filter_meta"] = {"rankingFilter": {"filter_id": best_metric_1["id"], "operator": ranking_type, "value": ranking_num}}
    else:
        insight_definition["filter_meta"] = None

    if slice_attributes_list_1:
        best_slice_attribute_1 = max(slice_attributes_list_1, key=lambda candidate: candidate['ratio'])
        insight_definition["attribute_meta"] = [{"attribute_id": best_slice_attribute_1["id"], "attribute_type": "label"}]
        if slice_attributes_list_2:
            best_slice_attribute_2 = max(slice_attributes_list_2, key=lambda candidate: candidate['ratio'])
            insight_definition["attribute_meta"].append({"attribute_id": best_slice_attribute_2["id"], "attribute_type": "label"})
    else:
        insight_definition["attribute_meta"] = None
        insight_definition["viz_type"] = 'headline'  # Default to headline if no slice by attr provided

    return insight_definition

def nlp_search(input, cn_insights, fuzz_ratio):
    """Find existing insights whose title resembles the user's sentence.

    The sentence itself and every variation produced by
    ``get_input_synonyms`` are fuzzy-compared with each insight's lower-cased
    title. An insight is selected as soon as one comparison scores strictly
    above ``fuzz_ratio``; each title is reported once, the first insight
    carrying it providing the id.

    The input sentence is echoed on standard output.

    Note from the original author: if a
    ``ValueError: Your change is resulting in a longer fmtstr than the
    original length and this is not supported.`` is raised, it is due to the
    string length restriction of the command line display; either shorten the
    string or enlarge the command line shell.

    Args:
        input: The user's sentence.
        cn_insights: Iterable of objects exposing ``title`` and ``id``.
        fuzz_ratio: Score a comparison has to exceed.

    Returns:
        Tuple ``(titles, title2id)``: the matching titles in discovery order
        and a dict mapping each of them to its insight id.
    """
    print("Input sentence is: ", input)

    # The raw sentence comes first, followed by its synonym-based variations
    candidates = [input, *get_input_synonyms(input)]

    titles = []      # titles shown to the user as choices
    title2id = {}    # converts the user's choice back to an insight id

    # Compare input against existing insights
    for insight in cn_insights:
        for candidate in candidates:
            score = fuzz.ratio(insight.title.lower(), candidate)
            if score > fuzz_ratio and insight.title not in title2id:
                title2id[insight.title] = insight.id
                titles.append(insight.title)

    return titles, title2id

# Define variables
full_host = "http://" + CN_HOST

# Vocabulary used while parsing a sentence
slicing_words = ['by']            # introduces the slice-by attributes
filtering_words = ['where']       # introduces the filter clause
conjunction_words = ['or', 'and']
# Two-word openers that are dropped from the beginning of a request
starting_words = [
    ['give', 'me'],
    ['show', 'me'],
    ['what', 'are'],
    ['what', 'is'],
    ['what', "'s"],
    ['what', "'re"],
    ['show', 'my'],
    ['compute', 'the'],
    ['compute', 'my'],
]
# Aggregation name -> keywords that request it
aggregation_words = {
    "Sum": ['total', 'sum', 'all'],
    "Average": ['average', 'avg', 'mean'],
    "Max": ['max', 'maximum', 'biggest', 'largest', 'highest', 'most'],
    "Min": ['min', 'minimum', 'least', 'lowest', 'smallest'],
    "Median": ['median'],
}
ordering_words = ['order by']     # not referenced by the code visible in this file
equating_words = ['is', 'are', 'equals', '=', 'equal']
ranking_words = ['top', 'bottom'] # not referenced by the code visible in this file
chart_words = ['chart', 'graph', 'report']
ignore_words = ['a', 'the', 'as', 'my', 'an', 'for']

# Loaded at import time; the path is relative to the current working directory
datetime_keywords = get_datetime_keywords('datetime_keywords.txt')

def main():
    """Command-line entry point: search insights for the sentence given as first argument.

    Prints "No input!" when the script is started without arguments.

    NOTE(review): ``insights`` is not defined in the part of the file visible
    here - presumably it is provided by ``from constants import *``; confirm.
    The value returned by ``nlp_search`` is currently discarded.
    """
    if len(sys.argv) == 1:
        print("No input!")
        return
    sentence = sys.argv[1].lower()
    nlp_search(sentence, insights, 60)


if __name__ == '__main__':
    main()


