import sys
import re


import numpy as np
import pandas as pd


sys.path.append('./../../prepare_gustav/src/')
from gustav import dimensions, ncbi


def gini(list_of_values):
    """
    Computes Gini Coefficient,
    ranging between 0 (no inequality) and 1 (absolute inequality)

    Input:
        list_of_values      iterable with numeric values; one-shot
                            iterators (e.g. generators) are accepted
    Output:
        gini_coefficient    float, Gini Coefficient

    Raises:
        ValueError          if no values are supplied

    Note: if the values sum to zero the final division is by zero
    (ZeroDivisionError for plain Python numbers).
    """

    # Modified after:
    # http://planspace.org/2013/06/21/
    # how-to-calculate-gini-coefficient-from-raw-data-in-python/

    sorted_list = sorted(list_of_values)

    # Measure the sorted copy rather than the input: the input may be an
    # iterator which has no len() and is already exhausted by sorted().
    length_of_list = len(sorted_list)
    if length_of_list == 0:
        raise ValueError('Gini Coefficient is not defined for empty lists.')

    # height: running total of the sorted values
    # area: area under the cumulative curve (trapezoid per value)
    height, area = 0, 0
    for value in sorted_list:
        height += value
        area += height - value / 2.

    # Area under the curve if every value were identical.
    fair_area = height * length_of_list / 2.

    return (fair_area - area) / fair_area


def taxa_extended():
    """
    Builds a taxon table with inferred membership flags.

    Output:
        taxa    merge of taxon_ncbi and taxon_name (from the 'names'
                taxonomy table) with the boolean columns viruses and
                coronaviridae; a flag is True for every taxon that is
                reached by following parent -> child links starting
                at Viruses (10239) or Coronaviridae (2499399)
    """

    taxonomy_nodes = ncbi.taxonomy('nodes').set_index('taxon_ncbi')

    # One row per taxon, indexed by its parent, so that .loc[parents]
    # returns all direct children of those parents.
    children_by_parent = taxonomy_nodes[['parent_taxon_ncbi']].copy()
    children_by_parent = children_by_parent.reset_index()
    children_by_parent = children_by_parent.set_index('parent_taxon_ncbi')

    flagged_roots = (
        ('viruses', 10239),           # Viruses
        ('coronaviridae', 2499399),   # Coronaviridae
    )

    for flag, _ in flagged_roots:
        children_by_parent.loc[:, flag] = False

    for flag, root in flagged_roots:
        # Level-by-level walk down the tree: flag the children of the
        # current frontier, then continue with those children.
        frontier = [root]
        while frontier:
            parents = [
                taxon for taxon in frontier
                if taxon in children_by_parent.index
            ]
            children_by_parent.loc[parents, flag] = True
            frontier = list(
                set(children_by_parent.loc[parents, 'taxon_ncbi']))

    children_by_parent = children_by_parent.reset_index(drop=True)

    taxon_names = ncbi.taxonomy('names')

    return pd.merge(
        taxon_names[['taxon_ncbi', 'taxon_name']],
        children_by_parent
    )

def human_protein_coding_genes():
    """
    Collects the identifiers of all human protein-coding genes.

    Output:
        genes   list with gene_ncbi of every gene of taxon 9606
                whose type_of_gene is 'protein-coding'
    """

    gene_table = ncbi.gene_info(9606, ['gene_ncbi', 'type_of_gene'])
    is_protein_coding = gene_table['type_of_gene'] == 'protein-coding'
    return list(gene_table.loc[is_protein_coding, 'gene_ncbi'].values)



def stack_by_delimiter_in_column(df, column, delimiter):
    """
    Stacks dataframe according to delimiter in column: every record is
    replaced by one record per delimited piece of its value in column,
    with the values of all other columns repeated.

    The values of column are cast to str before splitting, thus the
    returned column always holds strings. The input dataframe itself
    is left unchanged.

    Input:
        df          dataframe
        column      column with delimiter
        delimiter   delimiter (note: no regular expression)

    Output:
        stacked_df  stacked dataframe, with the index name and column
                    order of df; records are sorted by the original
                    index and then by the value in column

    """

    helper_key = 'original_index_used_before_splitting'

    orig_index_name = df.index.name
    orig_column_order = df.columns

    # Work on a copy, and replace (rather than rename in place) its index,
    # so that neither the caller's column values nor the caller's index
    # name are altered.
    work = df.copy()
    work[column] = work[column].astype(str)
    work.index = work.index.rename(helper_key)
    work = work.reset_index()

    # Plain str.split treats the delimiter literally; values without
    # delimiter yield a single piece and thus stay a single record.
    pieces_per_record = [value.split(delimiter) for value in work[column]]

    # Explicit integer dtype keeps np.repeat working for empty dataframes.
    repeats = np.array(
        [len(pieces) for pieces in pieces_per_record], dtype=int)
    positions = np.repeat(np.arange(len(work)), repeats)

    stacked = work.iloc[positions].copy()
    stacked[column] = [
        piece for pieces in pieces_per_record for piece in pieces]

    stacked = stacked.sort_values([helper_key, column])
    stacked = stacked.set_index(helper_key)
    stacked.index.name = orig_index_name
    stacked = stacked.loc[:, orig_column_order]

    return stacked


def flatten(l):
    """
    Flattens one level of nesting.

    Input:
        l       iterable of iterables
    Output:
        flat    list with the items of all inner iterables, in order
    """
    flat = []
    for sublist in l:
        for item in sublist:
            flat.append(item)
    return flat
