import sys
import re


import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('./../../prepare_gustav/src/')
from gustav import dimensions, ncbi, figshare, github

import meta_utils, export

   
def in_scope_pubtator_genes(ignore=None):
    """
    Returns the PubTator gene mentions ('covid19' collection) that
    are considered in scope

    A mention is kept if
        - it has a pubmed_id
        - its gene is annotated as protein-coding in NCBI gene info
        - it occurs in the TITLE, INTRO, ABSTRACT or RESULTS section
        - its paper is listed in 201016_fetch_medline/extracted.xlsx
          and neither carries an excluded publication type nor has an
          abstract containing 'this review' / 'this perspective'

    Input:
        ignore   list, optional: pubmed_id values to exclude

    Output:
        DataFrame with columns pubmed_id, gene_ncbi, taxon_ncbi, section
    """

    excluded_publication_types = [
        'D016454:Review',
        'D016420:Comment',
        'D016421:Editorial',
        'D017418:Meta-Analysis',
        'D000078182:Systematic Review',
        'D016433:News',
        'D016425:Published Erratum',
        'D016456:Historical Article',
        'D017203:Interview',
        'D016441:Retracted Publication',
        'D016440:Retraction of Publication',
        'D057405:Webcast',
        'D000075742:Expression of Concern',
        'D019477:Portrait'
    ]
    sections_in_scope = ['TITLE', 'INTRO', 'ABSTRACT', 'RESULTS']

    # Mentions that can be attributed to a paper
    mentions = ncbi.pubtator_genes('covid19')
    mentions = mentions.loc[mentions['pubmed_id'].notna()]

    # Restrict to protein-coding genes; merging on the shared column(s)
    # also attaches the taxon of each gene
    gene_info = ncbi.gene_info(
        usecols=['gene_ncbi', 'taxon_ncbi', 'type_of_gene'])
    is_protein_coding = gene_info['type_of_gene'] == 'protein-coding'
    protein_coding = gene_info.loc[
        is_protein_coding, ['gene_ncbi', 'taxon_ncbi']]
    mentions = mentions.merge(protein_coding)

    mentions = mentions[['pubmed_id', 'gene_ncbi', 'taxon_ncbi', 'section']]
    mentions = mentions.loc[mentions['section'].isin(sections_in_scope)]

    # Paper-level metadata; the helper presumably yields one row per
    # publication type of a paper -- TODO confirm in meta_utils
    medline_path = export.get_material_path(
        '201016_fetch_medline/extracted.xlsx')
    medline = pd.read_excel(medline_path)
    medline = meta_utils.stack_by_delimiter_in_column(
        medline, 'publication_types', '; ')

    has_excluded_type = medline['publication_types'].isin(
        excluded_publication_types)
    declares_review = medline['abstract'].str.contains(
        'this review|this perspective', case=False, na=False)

    # For each exclusion criterion keep only mentions whose paper is
    # listed in the metadata and is not flagged on any of its rows
    for flagged in (has_excluded_type, declares_review):
        flagged_pmids = medline.loc[flagged, 'pmid']
        retained_pmids = medline.loc[
            ~medline['pmid'].isin(flagged_pmids), 'pmid']
        mentions = mentions.loc[mentions['pubmed_id'].isin(retained_pmids)]

    if ignore is None:
        return mentions

    return mentions.loc[~mentions['pubmed_id'].isin(ignore)]
    
    
def attention_covid_19(ignore=None):
    """
    Returns the attention warranted in the COVID-19 literature

    Filters for protein-coding genes, and considers
    each mention in LitCOVID to equally contribute to the attention
    e.g.: if one gene is mentioned twice in a paper and
    there is a second gene mentioned once in the same paper,
    the first gene will receive attention of 2/3, and the second
    gene will receive attention of 1/3

    Input:
        ignore   list, optional: studies (pubmed_id values) to exclude

    Output:
        attention_by_gene   DataFrame with columns taxon_ncbi,
                            gene_ncbi and attention (summed over all
                            papers), sorted by descending attention

    """

    pubtator_genes = in_scope_pubtator_genes(ignore=ignore)

    # Every mention (row) is worth 1 / (number of in-scope mentions in
    # the same paper), so that each paper contributes a total of one.
    attention_per_mention = (
        1 / pubtator_genes['pubmed_id'].value_counts()
    ).to_frame('attention').rename_axis('pubmed_id').reset_index()

    pubtator_genes_attention = pd.merge(
        pubtator_genes,
        attention_per_mention,
        on='pubmed_id')

    # Use the groupby sum directly instead of agg(sum): passing the
    # builtin to agg is deprecated in recent pandas, where it currently
    # aliases to this very method but will stop doing so.
    attention_by_gene = (
        pubtator_genes_attention[['gene_ncbi', 'taxon_ncbi', 'attention']]
        .groupby(['taxon_ncbi', 'gene_ncbi'])
        .sum()
        .sort_values('attention', ascending=False)
        .reset_index()
    )

    return attention_by_gene

