#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 12:26:31 2018

@author: franckladam
"""

def _count_forward_then_reverse(sequences, forward, reverse):
    """Count sequences matching ``forward``, then ``reverse`` among the rest.

    Each sequence is counted at most once: the reverse pattern is only tried
    on sequences in which the forward pattern was not found.

    Returns a tuple ``(forward_count, reverse_count, unmatched)`` where
    ``unmatched`` is the list of sequences in which neither pattern was found.
    """
    forward_count = 0
    reverse_count = 0
    unmatched = []
    for sequence in sequences:
        if forward.search(sequence):
            forward_count += 1
        elif reverse.search(sequence):
            reverse_count += 1
        else:
            unmatched.append(sequence)
    return forward_count, reverse_count, unmatched


def _percent_of(count, total):
    """Return ``count`` as a percentage of ``total``, rounded to 3 decimals.

    Returns 0.0 when ``total`` is 0 so that an empty input does not raise
    ``ZeroDivisionError``.
    """
    if total == 0:
        return 0.0
    return round((count * 100) / total, 3)


def search(input_file, start=1, end=200):
    """Count and report DECA, HEXA, HOX and NFY motif occurrences.

    The file is read as one DNA sequence per line. Surrounding whitespace is
    stripped from every line and blank lines are ignored. Only the window
    ``start``..``end`` of each sequence is searched, and matching is
    case-sensitive (upper-case bases).

    For every motif family the forward pattern is searched first and the
    reverse-complement pattern is searched only in the sequences that lack
    the forward one, so a sequence is counted at most once per family.
    The HEXA motifs are searched only in sequences that contain no DECA
    motif; HOX and NFY motifs are searched in all sequences.

    Args:
        input_file: path of the text file holding the sequences.
        start: 1-based position of the first base searched (inclusive).
        end: 1-based position of the last base searched (inclusive).

    Returns:
        None. The report is printed to standard output; percentages are
        relative to the number of input sequences (0.0 for an empty file).
    """
    # Function-scope import, as in the original: the module has no
    # top-level import block.
    import re

    # Motifs to be searched (IUPAC notation in the trailing comments).
    deca_forward = re.compile("TGA[GCT]TGACA[GCT]")  # TGABTGACAB
    deca_reverse = re.compile("[AGC]TGTCA[AGC]TCA")  # VTGTCAVTCA
    hexa_forward = re.compile("TGACAG")
    hexa_reverse = re.compile("CTGTCA")
    hox_forward = re.compile("TGAT[ACGT][ACGT]AT")  # TGATNNAT
    hox_reverse = re.compile("AT[ACGT][ACGT]ATCA")  # ATNNATCA
    nfy_forward = re.compile("[AG]CCAAT[AC]")  # RCCAATM
    nfy_reverse = re.compile("[TG]ATTGG[TC]")  # KATTGGY

    # Strip before slicing so that leading whitespace does not shift the
    # window, and drop blank lines so they are not counted as sequences.
    with open(input_file) as seq_object:
        stripped_lines = (line.strip() for line in seq_object)
        seq = [line for line in stripped_lines if line]
    seqlist_length = len(seq)

    print("\n\n{} sequences from {} are being analyzed".format(seqlist_length, input_file))

    # Restrict every sequence to the requested window (start is 1-based).
    seq_portion = [line[start - 1:end] for line in seq]

    # DECA motif in all sequences.
    deca_fwd, deca_rev, no_deca = _count_forward_then_reverse(
        seq_portion, deca_forward, deca_reverse)
    total_motifs = deca_fwd + deca_rev
    percent = _percent_of(total_motifs, seqlist_length)

    print("\n\nthe DECA forward motif (TGABTGACAB) is found in {} sequence(s)".format(deca_fwd))
    print("the DECA reverse complement (VTGTCAVTCA) is found in {} of the remaining sequence(s)".format(deca_rev))
    print("\n\tTHE MOTIF (forward and reverse) IS FOUND IN {} OUT OF {} INPUT SEQUENCES SO ABOUT {}%".format(total_motifs, seqlist_length, percent))

    # HEXA motif, only in the sequences without any DECA motif.
    hexa_fwd, hexa_rev, _ = _count_forward_then_reverse(
        no_deca, hexa_forward, hexa_reverse)
    total_motifsHEXA = hexa_fwd + hexa_rev
    percentHEXA = _percent_of(total_motifsHEXA, seqlist_length)

    print("\n\nthe HEXA forward motif (TGACAG) is found in {} sequence(s) with no DECA".format(hexa_fwd))
    print("the HEXA reverse complement (CTGTCA) is found in {} of the remaining sequence(s) with no DECA".format(hexa_rev))
    print("\n\tTHE HEXA MOTIF (forward and reverse) IS FOUND IN {} OUT OF {} INPUT SEQUENCES WITH NO DECA MOTIFS SO ABOUT {}% OF INPUT SEQUENCES".format(total_motifsHEXA, seqlist_length, percentHEXA))

    # HOX motif in all sequences.
    hox_fwd, hox_rev, _ = _count_forward_then_reverse(
        seq_portion, hox_forward, hox_reverse)
    total_motifsHOX = hox_fwd + hox_rev
    percentHOX = _percent_of(total_motifsHOX, seqlist_length)

    print("\n\nthe HOX forward motif (TGATNNAT) is found in {} sequence(s)".format(hox_fwd))
    print("the HOX reverse motif (ATNNATCA) is found in {} sequence(s)".format(hox_rev))
    print("\n\tTHE HOX MOTIF (forward and reverse) IS FOUND IN {} OUT OF {} INPUT SEQUENCES SO ABOUT {}% OF INPUT SEQUENCES".format(total_motifsHOX, seqlist_length, percentHOX))

    # NFY motif in all sequences.
    nfy_fwd, nfy_rev, _ = _count_forward_then_reverse(
        seq_portion, nfy_forward, nfy_reverse)
    total_motifsNFY = nfy_fwd + nfy_rev
    percentNFY = _percent_of(total_motifsNFY, seqlist_length)

    print("\n\nthe NFY forward motif (RCCAATM) is found in {} sequence(s)".format(nfy_fwd))
    # The reported IUPAC string now matches the pattern actually searched.
    print("the NFY reverse motif (KATTGGY) is found in {} sequence(s)".format(nfy_rev))
    print("\n\tTHE NFY MOTIF (forward and reverse) IS FOUND IN {} OUT OF {} INPUT SEQUENCES SO ABOUT {}% OF INPUT SEQUENCES".format(total_motifsNFY, seqlist_length, percentNFY))
