##date:2021-01-09 author:Tiago Rodrigues Ferreira
##Code for preparing files for irradiated and untreated Ltrop scRNA-seq data analysis from Louradour et al. 2021

#fastqc to analyze read QC

module load fastqc/0.11.8-Java-1.8.0_45

fastqc /path/to/file/*.gz -o /path/to/file/QC/ -a /path/to/file/adapters.tab

#fastp to trim low quality reads

module load fastp/0.20.1

fastp -i /path/to/file/irradiated1_S1_L002_R1_001.fastq.gz -o /path/to/file/irradiated1_S1_L002_R1_001.fastq -I /path/to/file/irradiated1_S1_L002_R2_001.fastq.gz -O /path/to/file/irradiated1_S1_L002_R2_001.fastq --length_required 20 --average_qual 20 --detect_adapter_for_pe --correction -h /path/to/file/filtered/irradiated1.html -j /path/to/file/filtered/irradiated1.json

#prepare fasta file for cellranger
#clean up chromosome headers to keep only LmjF.XX and add the maxicircle kDNA to the fasta
#nuclear genome files from tritrypdb.org and maxicircle kDNA from leish-esp.cbm.uam.es

cat TriTrypDB-50_LmajorFriedlin_Genome.fasta | sed 's/ | o.*//' > LmjFv50.fa
cat mitochondria/LmjF_cbm_maxicircle.fasta >> LmjFv50.fa

#preparing gtf file for cellranger
#generate GTF file from gff

module load cufflinks/2.2.1-goolf-1.7.20

gffread gff/TriTrypDB-50_LmajorFriedlin.gff -F -T -o LmjFv50.gtf

#keep only CDS features, replace 'CDS' for 'exon' in column 3
grep -E 'CDS' LmjFv50.gtf | sed  's/CDS/exon/' > LmjFv50_v2.gtf

#adjust gtf with cellranger
module load cellranger/5.0.0
cellranger mkgtf LmjFv50_v2.gtf LmjFv50_cellranger.gtf --attribute=key:allowable_value


#prepare UTR coordinates available from Dillon et al. NAR 2015
#split data into minus or plus strand datasets
grep '-' 3UTR_coord.txt > 3UTR_coord_minus.txt
grep '+' 3UTR_coord.txt > 3UTR_coord_plus.txt

#sort the coordinates by column 1-> col4 -> col3 and then check if col3 is the first one in each subgroup of lines with the same column 1 value (Gene ID) and print it. r for descending order. Create matrix a, if the first number is the one in $3 then print.

sort -k1,1n -k4,4n -k3,3r 3UTR_coord_plus.txt | awk '!a[$1] {a[$1] = $3} $3 == a[$1]' > 3UTR_coord_plus_v2.txt
sort -k1,1n -k4,4n -k3,3n 3UTR_coord_minus.txt | awk '!a[$1] {a[$1] = $3} $3 == a[$1]' > 3UTR_coord_minus_v2.txt

#look for the 3UTR start coordinate +1 (=stop codon in the CDS annotation) in the gtf file then, print the lines with the new column 4 data (updated UTR annotation from Dillon et al), else there is no data in the Dillon et al paper then print the same line. Have to do this for minus and then plus separately
awk -v s=1 'BEGIN{FS="\t";OFS="\t"} NR==FNR{a[$4+s]=$3;next;} ($7=="-" && $4 in a) {print $1,$2,$3,a[$4],$5,$6,$7,$8,$9} ($7=="-" && $4 in a == 0) {print $1,$2,$3,$4,$5,$6,$7,$8,$9}' 3UTR_coord_minus_v2.txt LmjFv50_cellranger.gtf > LmjFv50_cellranger_minus.gtf

awk -v s=1 'BEGIN{FS="\t";OFS="\t"} NR==FNR{a[$4-s]=$3;next;} ($7=="+" && $5 in a) {print $1,$2,$3,$4,a[$5],$6,$7,$8,$9} ($7=="+" && $5 in a == 0) {print $1,$2,$3,$4,$5,$6,$7,$8,$9}' 3UTR_coord_plus_v2.txt LmjFv50_cellranger.gtf > LmjFv50_cellranger_plus.gtf

#combine both files
cat LmjFv50_cellranger_plus.gtf LmjFv50_cellranger_minus.gtf | sort -k1,1n > LmjFv50_cellranger_final.gtf

#add kDNA maxi circle whole annotation to the gtf file. It might need \n at the beginning to add as a new line.
echo -e 'LmjF_maxicircle\tCBM\texon\t1\t18998\t.\t+\t.\ttranscript_id "LmjF.maxi"; gene_id "LmjF.maxi";' >> LmjFv50_cellranger_final.gtf

#make transcriptome reference with cellranger 
cellranger mkref --genome=LmjFv50_3UTR --fasta=LmjFv50.fa --genes=LmjFv50_cellranger_final.gtf


#run cellranger count
cellranger count --id=irradiated1 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=irradiated1 --jobmode=local --localcores=8 --localmem=20
cellranger count --id=irradiated2 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=irradiated2 --jobmode=local --localcores=8 --localmem=20
cellranger count --id=untreated1 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=untreated1 --jobmode=local --localcores=8 --localmem=20
cellranger count --id=untreated2 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=untreated2 --jobmode=local --localcores=8 --localmem=20



