##date:2021-01-09 author:Tiago Rodrigues Ferreira ##Code for preparing files for irradiated and untreated Ltrop scRNA-seq data analysis from Louradour et al. 2021 #fastqc to analyze read QC module load fastqc/0.11.8-Java-1.8.0_45 fastqc /path/to/file/*.gz -o /path/to/file/QC/ -a /path/to/file/adapters.tab #fastp to trim low quality reads module load fastp/0.20.1 fastp -i /path/to/file/irradiated1_S1_L002_R1_001.fastq.gz -o /path/to/file/irradiated1_S1_L002_R1_001.fastq -I /path/to/file/irradiated1_S1_L002_R2_001.fastq.gz -O /path/to/file/irradiated1_S1_L002_R2_001.fastq --length_required 20 --average_qual 20 --detect_adapter_for_pe --correction -h /path/to/file/filtered/irradiated1.html -j /path/to/file/filtered/irradiated1.json #prepare fasta file for cellranger #clean up chromosome headers to keep only LmjF.XX and add the maxicircle kDNA to the fasta #nuclear genome files from tritrypdb.org and maxicircle kDNA from leish-esp.cbm.uam.es cat TriTrypDB-50_LmajorFriedlin_Genome.fasta | sed 's/ | o.*//' > LmjFv50.fa cat mitochondria/LmjF_cbm_maxicircle.fasta >> LmjFv50.fa #preparing gtf file for cellranger #generate GTF file from gff module load cufflinks/2.2.1-goolf-1.7.20 gffread gff/TriTrypDB-50_LmajorFriedlin.gff -F -T -o LmjFv50.gtf #keep only CDS features, replace 'CDS' for 'exon' in column 3 grep -E 'CDS' LmjFv50.gtf | sed 's/CDS/exon/' > LmjFv50_v2.gtf #adjust gtf with cellranger module load cellranger/5.0.0 cellranger mkgtf LmjFv50_v2.gtf LmjFv50_cellranger.gtf --attribute=key:allowable_value #prepare UTR coordinates available from Dillon et al. NAR 2015 #split data into minus or plus strand datasets grep '-' 3UTR_coord.txt > 3UTR_coord_minus.txt grep '+' 3UTR_coord.txt > 3UTR_coord_plus.txt #sort the coordinates by column 1-> col4 -> col3 and then check if col3 is the first one in each subgroup of lines with the same column 1 value (Gene ID) and print it. r for descending order. Create matrix a, if the first number is the one in $3 then print. sort -k1,1n -k4,4n -k3,3r 3UTR_coord_plus.txt | awk '!a[$1] {a[$1] = $3} $3 == a[$1]' > 3UTR_coord_plus_v2.txt sort -k1,1n -k4,4n -k3,3n 3UTR_coord_minus.txt | awk '!a[$1] {a[$1] = $3} $3 == a[$1]' > 3UTR_coord_minus_v2.txt #look for the 3UTR start coordinate +1 (=stop codon in the CDS annotation) in the gtf file then, print the lines with the new column 4 data (updated UTR annotation from Dillon et al), else there is no data in the Dillon et al paper then print the same line. Have to do this for minus and then plus separately awk -v s=1 'BEGIN{FS="\t";OFS="\t"} NR==FNR{a[$4+s]=$3;next;} ($7=="-" && $4 in a) {print $1,$2,$3,a[$4],$5,$6,$7,$8,$9} ($7=="-" && $4 in a == 0) {print $1,$2,$3,$4,$5,$6,$7,$8,$9}' 3UTR_coord_minus_v2.txt LmjFv50_cellranger.gtf > LmjFv50_cellranger_minus.gtf awk -v s=1 'BEGIN{FS="\t";OFS="\t"} NR==FNR{a[$4-s]=$3;next;} ($7=="+" && $5 in a) {print $1,$2,$3,$4,a[$5],$6,$7,$8,$9} ($7=="+" && $5 in a == 0) {print $1,$2,$3,$4,$5,$6,$7,$8,$9}' 3UTR_coord_plus_v2.txt LmjFv50_cellranger.gtf > LmjFv50_cellranger_plus.gtf #combine both files cat LmjFv50_cellranger_plus.gtf LmjFv50_cellranger_minus.gtf | sort -k1,1n > LmjFv50_cellranger_final.gtf #add kDNA maxi circle whole annotation to the gtf file. It might need \n at the beginning to add as a new line. echo -e 'LmjF_maxicircle\tCBM\texon\t1\t18998\t.\t+\t.\ttranscript_id "LmjF.maxi"; gene_id "LmjF.maxi";' >> LmjFv50_cellranger_final.gtf #make transcriptome reference with cellranger cellranger mkref --genome=LmjFv50_3UTR --fasta=LmjFv50.fa --genes=LmjFv50_cellranger_final.gtf #run cellranger count cellranger count --id=irradiated1 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=irradiated1 --jobmode=local --localcores=8 --localmem=20 cellranger count --id=irradiated2 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=irradiated2 --jobmode=local --localcores=8 --localmem=20 cellranger count --id=untreated1 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=untreated1 --jobmode=local --localcores=8 --localmem=20 cellranger count --id=untreated2 --transcriptome=/path/to/file/LmjFv50_3UTR --fastqs=/path/to/file/filtered/ --sample=untreated2 --jobmode=local --localcores=8 --localmem=20