# Repeat Annotation code that was used for annotating the whale shark genome
# Repeat Annotation process was modified from http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced

# Genome assembly file needed as shark.fasta
# Final repeat files provided as supplementary file

# mdust required for MITE Hunter: https://github.com/lh3/mdust

# MITE discovery with MITE Hunter, expected to be installed under
# ~/bin/MITE_Hunter/.
# -P limits the run to a proportion of the genome (0.2 here): the assembly is
# over 1 GB and a full run is too memory intensive.
# The job is detached with nohup and backgrounded; stdout goes to
# mite_hunter.out.
nohup perl ~/bin/MITE_Hunter/MITE_Hunter_manager.pl \
  -i shark.all.pilon.fasta \
  -n 16 \
  -S 12345678 \
  -P .2 \
  > mite_hunter.out &

# GenomeTools (gt) is required.
# brew install genometools

# Remove the first underscore on every line of the assembly and write the
# result to "seqfile", the name used by every later step.
# NOTE(review): presumably this simplifies the sequence names for the
# downstream CRL scripts -- confirm.
sed 's/_//' shark.fasta > seqfile

# Build the index "seqfileindex" that ltrharvest and ltrdigest read.
# The last option must be the ASCII "-dna"; it was previously typed with a
# Unicode en dash, which is not parsed as an option.
gt suffixerator -db seqfile -indexname seqfileindex -tis -suf -lcp -des -ssp -dna

# Harvest LTR candidates with >= 99% similar terminal repeats and a TGCA motif.
# Writes seqfile.out99, seqfile.outinner99 and seqfile.gff99; the stdout
# report goes to seqfile.result99.
gt ltrharvest -index seqfileindex -out seqfile.out99 -outinner seqfile.outinner99 -gff3 seqfile.gff99 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -motif tgca -similar 99 -vic 10 > seqfile.result99

#These two commands will enable the collection of sequences with following features:
#Two terminal repeats which are >= 99% similar, ranging from 100 bp to 6000 bp;
#The two terminal repeats end with “TG…CA”;
#The size of entire element ranges from 1.5 kb to 25 kb;
#The element must be flanked by a 5bp TSD (target site duplication).
#The TSD is within 10 bp from the end of the element.

# Process...

# Sort the LTRharvest GFF3; the sorted file is what ltrdigest is given below.
gt gff3 -sort seqfile.gff99 > seqfile.gff99.sort

# ltrdigest needs a tRNA library (-trnas).
# Download eukaryotic-tRNAs.fa from http://lowelab.ucsc.edu/GtRNAdb/download.html
gt ltrdigest \
  -trnas eukaryotic-tRNAs.fa \
  seqfile.gff99.sort \
  seqfileindex \
  > seqfile.gff99.dgt

# Fetch and unpack the CRL helper scripts; they end up in CRL_Scripts1.0/.
wget http://www.hrt.msu.edu/uploads/535/78637/CRL_Scripts1.0.tar.gz
tar -xzf CRL_Scripts1.0.tar.gz

# Step 1 reads the ltrdigest annotation.
perl CRL_Scripts1.0/CRL_Step1.pl --gff seqfile.gff99.dgt

# Step 2 takes the Step 1 result together with the LTRharvest outputs and the
# genome sequence; the file named by --removed_repeats is moved into
# fasta_files/ for Step 3.
perl CRL_Scripts1.0/CRL_Step2.pl \
  --step1 CRL_Step1_Passed_Elements.txt \
  --repeatfile seqfile.out99 \
  --resultfile seqfile.result99 \
  --sequencefile seqfile \
  --removed_repeats CRL_Step2_Passed_Elements.fasta

# Run Step3 in a different directory.
# Step 3 runs inside fasta_files/, which holds the per-element Repeat_*.fasta
# files and the Step 2 output.
mkdir fasta_files
mv Repeat_*.fasta fasta_files
mv CRL_Step2_Passed_Elements.fasta fasta_files

# The subshell keeps the directory change local, so a failed cd cannot make
# later commands run in the wrong place. Inside fasta_files/ the scripts live
# one level up (../CRL_Scripts1.0) and --directory must name the current
# directory; the earlier relative paths "CRL_Scripts1.0/..." and
# "fasta_files" do not exist once we are inside fasta_files/.
(
  cd fasta_files \
    && perl ../CRL_Scripts1.0/CRL_Step3.pl --directory "$PWD" --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25 \
    && mv CRL_Step3_Passed_Elements.fasta ..
)

# ltr_library.pl: run on the Step 3 survivors of the 99% pass.
perl CRL_Scripts1.0/ltr_library.pl \
  --resultfile seqfile.result99 \
  --step3 CRL_Step3_Passed_Elements.fasta \
  --sequencefile seqfile

# Collect the genome_Step8_* files into a single MITE library.
# NOTE(review): these look like the MITE Hunter outputs; that job was started
# in the background and must have finished -- confirm.
cat genome_Step8_* > MITE.lib

# Library used to mask the LTR inner sequences: LTR-only library plus MITEs.
cat lLTR_Only.lib MITE.lib > repeats_to_mask_LTR99.fasta

# Mask the 99% inner sequences with the LTR+MITE library.
# -nolow is passed and results are written to the current directory.
RepeatMasker \
  -lib repeats_to_mask_LTR99.fasta \
  -nolow \
  -dir . \
  seqfile.outinner99

# Post-process the RepeatMasker result (.out and .masked) with cleanRM.pl,
# then run rmshortinner.pl with the argument 50 on what is left.
perl CRL_Scripts1.0/cleanRM.pl \
  seqfile.outinner99.out seqfile.outinner99.masked \
  > seqfile.outinner99.unmasked
perl CRL_Scripts1.0/rmshortinner.pl \
  seqfile.outinner99.unmasked 50 \
  > seqfile.outinner99.clean

# Download both transposase sets (Tpases020812 and Tpases020812DNA), then
# decompress them in place.
for tpase_set in Tpases020812 Tpases020812DNA; do
  wget "http://www.hrt.msu.edu/uploads/535/78637/${tpase_set}.gz"
done
for tpase_set in Tpases020812 Tpases020812DNA; do
  gunzip "${tpase_set}.gz"
done

# Format Tpases020812DNA as a protein BLAST database (it is searched with
# blastx afterwards).
makeblastdb -in Tpases020812DNA -dbtype prot

# BLAST the cleaned inner sequences against the transposase database.
# -db names the database where makeblastdb built it (Tpases020812DNA in the
# current directory); the earlier "TpasesDNA/" prefix pointed at a directory
# this file never creates.
nohup blastx -query seqfile.outinner99.clean -db Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out seqfile.outinner99.clean_blastx.out.txt > blastx_tpasedna.txt &
blastx_pid=$!

# The parser below reads the blastx report, so block until the background job
# has finished writing it.
wait "$blastx_pid"

perl CRL_Scripts1.0/outinner_blastx_parse.pl --blastx seqfile.outinner99.clean_blastx.out.txt --outinner seqfile.outinner99

# "--step3" must be typed with two ASCII hyphens; it was previously written
# with an en dash, which is not parsed as an option.
perl CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile seqfile.result99 --innerfile passed_outinner_sequence.fasta --sequencefile seqfile

# Harvest LTRs a second time with relaxed settings: -similar is omitted, so
# the default of 85% applies, and no -motif is required.
gt ltrharvest -index seqfileindex -out seqfile.out85 -outinner seqfile.outinner85 -gff3 seqfile.gff85 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -vic 10 > seqfile.result85 # Default -similar is 85%

# NOTE(review): seqfile.outinner85.out / .masked look like RepeatMasker output
# for seqfile.outinner85, but no such RepeatMasker run (nor Steps 1-3 for the
# 85% set) appears in this file -- confirm they were run before this point.

# Option spelling is "--step3" everywhere, matching the 99% pass.
nohup perl CRL_Scripts1.0/ltr_library.pl --resultfile seqfile.result85 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile seqfile > ltr_lib.out &
ltr_lib_pid=$!
perl CRL_Scripts1.0/cleanRM.pl seqfile.outinner85.out seqfile.outinner85.masked > seqfile.outinner85.unmasked
nohup perl CRL_Scripts1.0/rmshortinner.pl seqfile.outinner85.unmasked 50 > seqfile.outinner85.clean &
rmshort_pid=$!
nohup perl CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile seqfile.result85 --innerfile passed_outinner_sequence.fasta --sequencefile seqfile > crl_step4.out &
step4_pid=$!

# The BLAST inputs below are consumed straight away, so wait for the
# background jobs above before continuing.
wait "$ltr_lib_pid" "$rmshort_pid" "$step4_pid"

# All-vs-all BLAST of the LTR sequences and of the inner sequences.
makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl
blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out
makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl
blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out

# --LTR_blast takes the blastn report (*.fasta.out), mirroring --inner_blast;
# it previously pointed at the FASTA file itself.
nohup perl CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR85.lib --pcoverage 90 --pidentity 80 > crl_step5.out &
step5_pid=$!
# LTR85.lib is concatenated below, so Step 5 has to be finished first.
wait "$step5_pid"
# Because no LTR's made it through 99%, no LTR99.lib file to add or filter

cat LTR85.lib MITE.lib > allMITE_LTR.lib

# To do repeatmasking, because the genome is fairly large, we will split the file
# Use faSplit from http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/

# Split seqfile into 20 pieces named seqfile_pNN.fa.
faSplit sequence seqfile 20 seqfile_p

# Run RepeatMasker on every piece in the background. This replaces the single
# hard-coded p09 command that had to be repeated by hand. The log name is
# derived from the piece name: seqfile_p09.fa -> rm_p09.out.
mask_pids=()
for piece in seqfile_p*.fa; do
  [[ -e "$piece" ]] || continue   # glob matched nothing
  tag=${piece#seqfile_}
  tag=${tag%.fa}
  nohup RepeatMasker -lib allMITE_LTR.lib -dir . "$piece" > "rm_${tag}.out" &
  mask_pids+=("$!")
done

# The masked pieces are concatenated next, so wait for every job to finish.
if ((${#mask_pids[@]} > 0)); then
  wait "${mask_pids[@]}"
fi

# Combine repeat masked genome
cat seqfile_p*masked > seqfile.masked
perl CRL_Scripts1.0/rmaskedpart.pl seqfile.masked 50 > umseqfile
# Build a RepeatModeler database from umseqfile and run RepeatModeler on it.
~/bin/RepeatModeler/BuildDatabase -name umseqfiledb -engine ncbi umseqfile
nohup ~/bin/RepeatModeler/RepeatModeler -database umseqfiledb > umseqfile.out &
repeatmodeler_pid=$!

# The consensus library is parsed below, so wait for the run to finish.
wait "$repeatmodeler_pid"

# RM_12116.MonNov211714322016 is the results directory of the original run.
# NOTE(review): the name looks run-specific -- substitute the directory your
# own RepeatModeler run produced.
cd RM_12116.MonNov211714322016
# Split the classified consensi into unknown and identified repeats.
perl ../CRL_Scripts1.0/repeatmodeler_parse.pl --fastafile consensi.fa.classified --unknowns repeatmodeler_unknowns.fasta --identities repeatmodeler_identities.fasta

# The transposase protein database is kept in Tpases/, next to the
# RepeatModeler results directory (blastx refers to it as ../Tpases/ below).
# Tpases020812 was unpacked in the parent directory, so step out of the
# RepeatModeler directory first; without this the mv and the later
# "cd ../RM_..." cannot work.
cd ..
mkdir Tpases
mv Tpases020812 Tpases
cd Tpases
makeblastdb -in Tpases020812 -dbtype prot

cd ../RM_12116.MonNov211714322016

# Search the unclassified RepeatModeler consensi against the transposases.
nohup blastx -query repeatmodeler_unknowns.fasta -db ../Tpases/Tpases020812 -evalue 1e-10 -num_descriptions 10 -out modelerunknown_blast_results.txt > tpase_blast.out &
blastx_pid=$!
# The parser reads the blastx report, so wait for the search to finish.
wait "$blastx_pid"
perl ../CRL_Scripts1.0/transposon_blast_parse.pl --blastx modelerunknown_blast_results.txt --modelerunknown repeatmodeler_unknowns.fasta

# unknown_elements.txt becomes the unknown library; identified_elements.txt
# joins the repeats RepeatModeler had already identified.
mv unknown_elements.txt ModelerUnknown.lib
cat identified_elements.txt repeatmodeler_identities.fasta > ModelerID.lib


# ID some of the unknown repeats with Dfam

# Classify unclassifieds from RepeatClassifier using Dfam:
# Downloaded 27 Feb 2017
# Everything is downloaded into db/ (later commands read db/Dfam.hmm). The
# "cd db" was missing, which left the files outside db/ and made the closing
# "cd .." leave the working directory instead of returning to it.
dfam_release=http://dfam.org/web_download/Current_Release
mkdir -p db
cd db
wget "${dfam_release}/Dfam.hmm.gz"
gunzip Dfam.hmm.gz
# Index files that accompany Dfam.hmm.
for h3_ext in h3f h3i h3m h3p; do
  wget "${dfam_release}/Dfam.hmm.${h3_ext}"
done
wget "${dfam_release}/Dfam.version"
wget "${dfam_release}/relnotes.txt"
mv relnotes.txt Dfam_relnotes.txt
wget "${dfam_release}/Dfam.seed.gz"
cd ..

# In repeat analysis folder:
# Search the unknown RepeatModeler consensi with the Dfam HMMs.
nohup nhmmer --tblout dfam.modelerunknown.tbl --dfamtblout dfam.modelerunknown.dfamtbl db/Dfam.hmm ModelerUnknown.lib > dfam.modelerunknown.out 2> dfam.modelerunknown.err &
nhmmer_pid=$!
# The hit table is parsed right below, so wait for nhmmer to finish.
wait "$nhmmer_pid"

##### Parse Dfam results and rename repeats that hit to a single repeat superfamily:
# Get the names so you can rename them

# For every hit, take column 4 of the table (used here as the Dfam accession),
# look it up in Dfam.hmm and build "Class/Superfamily" from the last two lines
# containing "CT" within the 30 lines after the match.
# NOTE(review): this assumes the Class and Superfamily lines are the last two
# CT lines of the record -- confirm against the Dfam release in use.
for acc in $(grep -v '^#' dfam.modelerunknown.tbl | awk '{print $4}'); do
  class=$(grep -A30 "$acc" db/Dfam.hmm | grep 'CT' | tail -n2 | tr '\n' ' ' | sed 's/CT.*Class; //' | sed 's/; CT.*Superfamily; /\//' | tr -d ';')
  echo "$acc" "$class"
done > accs_classes0.txt

# Rewrite the labels: "Cut and Paste" -> DNA, drop "/Undefined", and turn a
# bare " tRNA" into " SINE/tRNA".
sed 's/Cut and Paste/DNA/' accs_classes0.txt | sed 's/\/Undefined//' | sed 's/ tRNA/ SINE\/tRNA/' > accs_classes.txt

# Join column 1 of the hit table with the matching class label as
# "name@class", then sort and de-duplicate.
paste -d'@' \
  <(grep -v '^#' dfam.modelerunknown.tbl | awk '{print $1}') \
  <(cut -f2 -d' ' accs_classes.txt) \
  | sort \
  | uniq \
  > dfam.id.modelerunknown.txt

# Keep only names that occur with exactly one class: count each name, keep
# the ones counted once, and pull their lines back out of the full list.
cut -f1 -d'@' dfam.id.modelerunknown.txt \
  | uniq -c \
  | grep ' 1 ' \
  | awk '{print $2}' \
  | grep -f - dfam.id.modelerunknown.txt \
  > dfam.id.modelerunknown.sub.txt

# Make each "name@class" entry safe to use inside a sed expression: escape the
# first "[" and "]" and temporarily replace the first "/" with "^".
sed -e 's/\[/\\[/' -e 's/\]/\\]/' -e 's/\//^/' dfam.id.modelerunknown.sub.txt > dfam.id.modelerunknown.sub.rn.txt

# Work on a copy of the library and relabel "<name>#Unknown" as
# "<name>#<class>" for every uniquely classified repeat.
cp ModelerUnknown.lib ModelerUnknown.lib.dfamclassified
for entry in $(cat dfam.id.modelerunknown.sub.rn.txt); do
  # The name is the text before "@", with anything from "#" onwards removed.
  seq_name=$(echo $entry | cut -f1 -d'@' | cut -f1 -d'#')
  new_class=$(echo $entry | cut -f2,3 -d'@')
  sed -i "s/$seq_name#Unknown/$seq_name#$new_class/" ModelerUnknown.lib.dfamclassified
done
# Turn the "^" placeholder back into "/" (first occurrence per line).
sed -i 's/\^/\//' ModelerUnknown.lib.dfamclassified

# Use faSomeRecords from http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/ to get sequences
# Split the relabelled library into records still tagged Unknown and records
# that received a class. Only header lines ('^>') are used to build the name
# lists; filtering the whole file with "grep -v Unknown" also passed every
# sequence line to faSomeRecords as if it were a record name.
~/bin/faSomeRecords ModelerUnknown.lib.dfamclassified <(grep '^>' ModelerUnknown.lib.dfamclassified | grep 'Unknown' | tr -d '>') ModelerUnknown.Dfamunknown.lib
~/bin/faSomeRecords ModelerUnknown.lib.dfamclassified <(grep '^>' ModelerUnknown.lib.dfamclassified | grep -v 'Unknown' | tr -d '>') ModelerUnknown.DfamID.lib


# Proteins that look like transposases are to be excluded from the protein
# database, so search UniRef against the transposase set first.
# UniRef (v90 was downloaded here) goes into the db folder.
# CASIC was used:
blastp \
  -query db/uniref90.fasta \
  -db Tpases020812 \
  -max_target_seqs 1 \
  -outfmt 6 \
  -evalue 1e-5 \
  -num_threads 16 \
  -out uniref90_2_Tpases_blast.out \
  > uniref90_2_Tpases.out

# Search every *sub.faa.gz protein set against the transposases, one
# background blastp per file.
# The generated commands previously had no query at all, so blastp never saw
# the proteins; each file is now decompressed and piped in through "-query -".
# NOTE(review): the output names written here (<rep>.tpase.out and
# <rep>_tpase.out) do not match the "*tp.out" names a later loop in this file
# looks for -- confirm which naming was actually used.
# These jobs run in the background; let them finish before using the results.
for faa_gz in *sub.faa.gz; do
  [[ -e "$faa_gz" ]] || continue   # glob matched nothing
  rep=$(basename "$faa_gz" sub.faa.gz)
  zcat "$faa_gz" \
    | blastp -query - -db ../Tpases/Tpases020812 -max_target_seqs 1 -outfmt 6 -evalue 1e-5 -num_threads 10 -out "${rep}.tpase.out" \
    > "${rep}_tpase.out" &
done

# Unique query IDs (column 1) of UniRef proteins with a transposase hit ...
cut -f1 uniref90_2_Tpases_blast.out \
  | sort \
  | uniq \
  > tmp.txt
# ... and UniRef with those records excluded.
~/bin/faSomeRecords -exclude \
  db/uniref90.fasta \
  tmp.txt \
  uniref90_notp.fa

# For every transposase BLAST result (*tp.out), write a copy of the matching
# vertebrate_other protein set with the hit proteins excluded, gzipped.
for tp_hits in $(ls *tp.out); do
  # Reduce the result file name to its set number.
  num=$(basename "$tp_hits" .protein.est_tp.out | sed 's/vertebrate_other\.//')
  prot_set="vertebrate_other.${num}.protein.est"
  zcat "${prot_set}.faa.gz" > tmp.faa
  # Unique IDs from column 1 of the BLAST table are the records to drop.
  cut -f1 "$tp_hits" | sort | uniq > tmp.txt
  ~/bin/faSomeRecords -exclude tmp.faa tmp.txt "${prot_set}.notp.faa"
  gzip "${prot_set}.notp.faa"
  # Removes everything matching tmp* in the current directory.
  rm tmp*
done


#cat uniref90_notp.fa <(zcat vertebrate_other*notp.faa.gz) > uniref_refseq_prot_notp.fa
#
#makeblastdb -in uniref_refseq_prot_notp.fa -dbtype prot
#cd ../RM_12116.MonNov211714322016/
#nohup blastx -query ModelerUnknown.lib -db ../prot_db/uniref_refseq_prot_notp.fa -evalue 1e-10 -num_descriptions 10 -num_threads 10 -out ModelerUnknown.lib_blast_results.txt > modelerunknown_blast_results.txt &

# Format the transposase-free UniRef set as a protein BLAST database.
# NOTE(review): the blastx call below reads it as ../prot_db/uniref90_notp.fa,
# so this is presumably run inside prot_db/ -- confirm.
makeblastdb -in uniref90_notp.fa -dbtype prot
cd ../RM_12116.MonNov211714322016/

# Search the repeats that are still unknown after Dfam against that database.
nohup blastx -query ModelerUnknown.Dfamunknown.lib -db ../prot_db/uniref90_notp.fa -evalue 1e-10 -num_descriptions 10 -num_threads 10 -out ModelerUnknown.Dfamunknown.lib_blast_results.txt > modelerunknown.dfamunknown_blast_results.txt &
blastx_pid=$!
# ProtExcluder reads the blastx report, so wait for the search to finish.
wait "$blastx_pid"

~/bin/ProtExcluder1.2/ProtExcluder.pl ModelerUnknown.Dfamunknown.lib_blast_results.txt ModelerUnknown.Dfamunknown.lib

cd ..






# Classify MITE_LTR results
nohup ~/bin/RepeatModeler/RepeatClassifier -consensi allMITE_LTR.lib -engine ncbi > allmiteltr.classify.out &
classifier_pid=$!
# allMITE_LTR.lib.classified is read right below, so wait for the classifier.
wait "$classifier_pid"
# rename remaining unknowns appropriately
# LTR
# MITEs are all DNA transposons, so we'll tag them as such.
# NOTE(review): no command in this file performs that tagging -- confirm
# whether it was done by hand.

# Split the classified library into unknown and identified records. Only
# header lines ('^>') are used to build the name lists, so sequence lines are
# no longer passed to faSomeRecords as if they were record names.
~/bin/faSomeRecords allMITE_LTR.lib.classified <(grep '^>' allMITE_LTR.lib.classified | grep Unknown | tr -d '>') allMITE_LTR_unknowns.fasta
~/bin/faSomeRecords allMITE_LTR.lib.classified <(grep '^>' allMITE_LTR.lib.classified | grep -v Unknown | tr -d '>') allMITE_LTR_identities.fasta

# Classify with Dfam
# NOTE(review): db/Dfam.hmm must be reachable from the directory this is run
# in -- confirm where db/ was created.
nohup nhmmer --tblout dfam.allmiteltr.tbl --dfamtblout dfam.allmiteltr.dfamtbl db/Dfam.hmm allMITE_LTR_unknowns.fasta > dfam.allmiteltr.out 2> dfam.allmiteltr.err &
nhmmer_pid=$!
# The hit table is parsed right below, so wait for nhmmer to finish.
wait "$nhmmer_pid"

##### Parse Dfam results and rename repeats that hit to a single repeat superfamily:
# Get the names so you can rename them

# For every hit, take column 2 of the Dfam-format table (used here as the
# model accession), look it up in Dfam.hmm and build "Class/Superfamily" from
# the last two lines containing "CT" within the 30 lines after the match.
# The tRNA rewrite is anchored on the leading space (" tRNA"), as in the
# RepeatModeler pass, so a label that already reads "SINE/tRNA" is not turned
# into "SINE/SINE/tRNA".
for acc in $(grep -v '^#' dfam.allmiteltr.dfamtbl | awk '{print $2}'); do
  class=$(grep -A30 "$acc" db/Dfam.hmm | grep 'CT' | tail -n2 | tr '\n' ' ' | sed 's/CT.*Class; //' | sed 's/; CT.*Superfamily; /\//' | tr -d ';')
  echo "$acc" "$class"
done | sed 's/Cut and Paste/DNA/' | sed 's/\/Undefined//' | sed 's/ tRNA/ SINE\/tRNA/' > accs_classes.txt

# Join column 1 of the Dfam-format table with the matching class label as
# "name@class", then sort and de-duplicate.
paste -d'@' \
  <(grep -v '^#' dfam.allmiteltr.dfamtbl | awk '{print $1}') \
  <(cut -f2 -d' ' accs_classes.txt) \
  | sort \
  | uniq \
  > dfam.id.miteltr.txt

# Keep only names that occur with exactly one class: count each name, keep
# the ones counted once, and pull their lines back out of the full list.
cut -f1 -d'@' dfam.id.miteltr.txt \
  | uniq -c \
  | grep ' 1 ' \
  | awk '{print $2}' \
  | grep -f - dfam.id.miteltr.txt \
  > dfam.id.miteltr.sub.txt


# Make each "name@class" entry safe to use inside a sed expression: escape the
# first "[" and "]" and temporarily replace the first "/" with "^".
sed -e 's/\[/\\[/' -e 's/\]/\\]/' -e 's/\//^/' dfam.id.miteltr.sub.txt > dfam.id.miteltr.sub.rn.txt

# Work on a copy of the unknowns and relabel "<name>#Unknown" as
# "<name>#<class>" for every uniquely classified repeat.
cp allMITE_LTR_unknowns.fasta allMITE_LTR_unknowns.fasta.dfamclassified
for entry in $(cat dfam.id.miteltr.sub.rn.txt); do
  # The name is the text before "@", with anything from "#" onwards removed.
  seq_name=$(echo $entry | cut -f1 -d'@' | cut -f1 -d'#')
  new_class=$(echo $entry | cut -f2,3 -d'@')
  sed -i "s/$seq_name#Unknown/$seq_name#$new_class/" allMITE_LTR_unknowns.fasta.dfamclassified
done
# Turn the "^" placeholder back into "/" (first occurrence per line).
sed -i 's/\^/\//' allMITE_LTR_unknowns.fasta.dfamclassified


# Combine RepeatModeler and MITE_LTR results

# Everything that carries a classification goes into KnownRepeats.lib.
cat allMITE_LTR_identities.fasta allMITE_LTR_unknowns.fasta.dfamclassified RM_12116.MonNov211714322016/ModelerID.lib RM_12116.MonNov211714322016/ModelerUnknown.DfamID.lib > KnownRepeats.lib

# NOTE(review): prot_db/uniref_refseq_prot_notp.fa is only built by commands
# that are commented out in this file; the database built by the live commands
# is prot_db/uniref90_notp.fa -- confirm which one was actually used.
nohup blastx -query KnownRepeats.lib -db prot_db/uniref_refseq_prot_notp.fa -evalue 1e-10 -num_descriptions 10 -num_threads 10 -out KnownRepeats.lib_blast_results.txt > knownrepeats_blast_results.txt &
blastx_pid=$!
# The blastx report is rewritten right below, so wait for the search to finish.
wait "$blastx_pid"

# Cut everything from "pilon_(" onwards down to "pilon", in both the library
# and the BLAST report, so the names in the two files still agree.
sed 's/pilon_(.*/pilon/g' KnownRepeats.lib > KnownRepeats.renamed.lib
sed 's/pilon_(.*/pilon/g' KnownRepeats.lib_blast_results.txt > KnownRepeats.lib_blast_results.renamed.txt

~/bin/ProtExcluder1.2/ProtExcluder.pl KnownRepeats.lib_blast_results.renamed.txt KnownRepeats.renamed.lib

# Final library: protein-filtered unknowns plus protein-filtered knowns.
# ProtExcluder was run on ModelerUnknown.Dfamunknown.lib, so its output is
# ModelerUnknown.Dfamunknown.libnoProtFinal. The earlier name
# (ModelerUnknown.libnoProtFinal) is not produced by any command in this file
# and would re-add the repeats that Dfam has already classified.
cat RM_12116.MonNov211714322016/ModelerUnknown.Dfamunknown.libnoProtFinal KnownRepeats.renamed.libnoProtFinal > allRepeats.lib


# Repeat chart:
# We'll use R
# The R statements are fed to R through a quoted heredoc so that this file
# remains valid shell; as bare lines they are a shell syntax error. When R
# runs non-interactively like this, the plot is written to R's default
# graphics file (Rplots.pdf) in the current directory.
R --vanilla --quiet <<'REPEAT_CHART_R'
# LINEs are from RepeatMasker and from subsequent BLAST of unknown repeats from RepeatMasker to Transposases
# Number of repeat families per category, in the order of the names below.
repeats <- c(4,24+67,28+3,188,2,2,3,2,15,12,340)
names(repeats) <- c("MITE","LTR","DNA transposons","LINE","RC","rRNA","Satellite","Simple repeats","tRNAs","Uncertain SINEs","Unknown (Novel)")
# Large bottom margin so the vertical category labels (las=2) fit.
par(mar=c(8,4,2,2))
barplot(sort(repeats,decreasing = TRUE),las=2,ylab="Number of repeats families")
REPEAT_CHART_R


# Get Repeat chart for whole genome
# RepeatMasker writes a formatted .tbl file for every input it is given.
# The genome is large, so it is masked piece by piece; the per-piece tbl files
# have to be combined afterwards.
# -qq is a rush job just to get some numbers: 4-10x faster than the default.

# If you want RepeatMasker to count all LTR's, label them appropriately
# LTRharvest just uses the contig name, so you can easily just sed out the 'pilon'
sed 's/pilon/#LTR/' ../allRepeats.lib > ../allRepeats.ann.lib

# One background RepeatMasker job per piece, p00 through p19; each job logs to
# rm_pNN.out in the current directory.
# NOTE(review): the ../ paths imply this is run from a subdirectory of the
# analysis folder -- confirm which one.
for ((piece_idx = 0; piece_idx < 20; piece_idx++)); do
  printf -v piece_tag 'p%02d' "$piece_idx"
  nohup RepeatMasker -lib ../allRepeats.ann.lib -gff -qq -dir . "../seqfile_${piece_tag}.fa" > "rm_${piece_tag}.out" &
done