import sys
import re


import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('./../../prepare_gustav/src/')
from gustav import dimensions, ncbi


def papers_in_various_resources():
    """
    Counts the unique pubmed_ids found in the dimensions
    covid-19 publications, in the pubtator covid19 articles,
    and in the pubtator covid19 gene annotations, together
    with the overlap of either pubtator table with dimensions.

    A pubtator record counts as shared with dimensions if
    its pmc_id or its pubmed_id occurs in the dimensions
    publications.

    Output:
    overall_publication_stats   dataframe with
                                    category
                                    papers (unique pubmed_ids)

    """

    dimensions_publications = dimensions.covid_19_publications_datasets_and_clinical_trials('publications')
    pubtator_publications = ncbi.pubtator_articles('covid19').drop_duplicates()
    pubtator_genes = ncbi.pubtator_genes('covid19').drop_duplicates()

    # isin() treats a missing value as equal to another missing value,
    # so records lacking an identifier would otherwise be reported as
    # shared as soon as any dimensions record lacks the same identifier.
    # NOTE(review): assumes missing identifiers are stored as NaN/None;
    # confirm against the loaders.
    dimensions_pmc_ids = dimensions_publications['pmc_id'].dropna()
    dimensions_pubmed_ids = dimensions_publications['pubmed_id'].dropna()

    def papers_also_in_dimensions(records):
        # unique pubmed_ids of records matched through either identifier
        is_shared = (
            records['pmc_id'].isin(dimensions_pmc_ids)
            | records['pubmed_id'].isin(dimensions_pubmed_ids)
        )
        return records.loc[is_shared, 'pubmed_id'].nunique()

    # insertion order defines the row order of the returned dataframe
    overall_publication_stats = {
        'dimensions': dimensions_publications['pubmed_id'].nunique(),
        'pubtator': pubtator_publications['pubmed_id'].nunique(),
        'pubtator_and_dimensions': papers_also_in_dimensions(pubtator_publications),
        'pubtator_genes': pubtator_genes['pubmed_id'].nunique(),
        'pubtator_genes_and_dimensions': papers_also_in_dimensions(pubtator_genes),
    }

    overall_publication_stats = pd.Series(
        overall_publication_stats).to_frame('papers').rename_axis('category').reset_index()
    return overall_publication_stats

def dimensions_papers_per_day(last_legit_date='2020-06-10'):
    """
    Obtains the publications per day from the source:
    dimensions.covid19, and does some categorization
    of dates (in timing) as 2020-01-01 appears high,
    despite lack of subsequent research (suggesting that
    it primarily is a sink for everything unassigned)
    and as some have future publication dates.

    Input:
    last_legit_date     string in YYYY-MM-DD format
                        defining last legit date to be
                        included in past, rather than
                        future


    Output:
    papers_per_day  dataframe sorted by publication_day with
                        publication_day (datetime)
                        papers
                        year_month (202X-MM as string, missing
                            if the date lacks that pattern)
                        timing (unassigned, past, future;
                            missing for dates before
                            2020-01-01)

    """

    sink_date = '2020-01-01'    # in absolute numbers this is highest, but before broad outbreak

    dimensions_publications = dimensions.covid_19_publications_datasets_and_clinical_trials('publications')

    papers_per_day = (
        dimensions_publications['publication_date']
        .value_counts()
        .to_frame('papers')
        .rename_axis('publication_day')
        .reset_index()
    )

    # extract year and month while the dates are still strings
    papers_per_day['year_month'] = papers_per_day['publication_day'].str.extract(
        r'(202[0-9]-[0-1][0-9])', expand=False)

    # an explicit unit is required: pandas 2.0 and later refuse to cast
    # to the unit-less 'datetime64'
    papers_per_day['publication_day'] = papers_per_day['publication_day'].astype('datetime64[ns]')

    papers_per_day = papers_per_day.sort_values('publication_day').reset_index(drop=True)

    publication_day = papers_per_day['publication_day']

    f = publication_day == sink_date
    papers_per_day.loc[f, 'timing'] = 'unassigned'

    f = publication_day > last_legit_date
    papers_per_day.loc[f, 'timing'] = 'future'

    f = (publication_day <= last_legit_date) & (publication_day > sink_date)
    papers_per_day.loc[f, 'timing'] = 'past'

    return papers_per_day