# Summarize orthogroup data 

# Orthogroups.txt is the output of OrthoFinder (provided as a supplementary file).
# The protein/gene ID tables were written by the previous step in R and are named ens_[speciescode]_prot_genes.txt.

# ---------------------------------------------------------------------------
# Map ohnolog families to OrthoFinder orthogroups, one species at a time.
# Requires bash and GNU sed/grep (the regexes use \b and \|).
# No `set -e`: a species whose inputs are missing is reported and skipped so
# that the remaining species are still processed.
# ---------------------------------------------------------------------------

#######################################
# Print a histogram of "orthogroups per family" for one ohno2ortho table.
# Arguments: $1 - two-column table (family line number <TAB> orthogroup ID)
# Outputs:   `uniq -c` style lines "<number of families> <orthogroups per
#            family>", sorted numerically by the second column.
#######################################
ohno_count_summary() {
  cut -f1 -- "$1" | sort | uniq -c | awk '{print $1}' | sort | uniq -c | sort -nk2,2
}

#######################################
# Build ohno2ortho_<species>_strict.txt for one species.
#
# Step 1: copy <species>.Families.Strict.2R.rn.txt to
#         <species>.Families.Strict.2R.pep.txt, replacing every ID found in
#         column 3 of ens_<species>_prot_genes.txt with the ID in column 2
#         (presumably gene ID -> protein ID; confirm against the R step).
#         As before, only the first whole-word occurrence per line is replaced
#         and the IDs are used as-is inside the sed expression, so they must
#         not contain '/', '&' or regex metacharacters.
# Step 2: for every line of the renamed file, drop the first tab-separated
#         column, strip ',' and '|', split the rest on tabs/spaces and use the
#         resulting tokens as grep patterns against the orthogroups file.
#         The text before the first ':' of each matching line is emitted as
#         "<line number><TAB><orthogroup>".
#
# Arguments:
#   $1 - species code (e.g. hsap)
#   $2 - optional basic regex; tokens matching it are discarded before the
#        orthogroup lookup (default: keep every token)
#   $3 - optional path to the OrthoFinder file (default: ../Orthogroups.txt)
# Returns: non-zero if an input file is unreadable or the renaming step fails.
#######################################
ohno2ortho() {
  local sp=$1
  local exclude=${2:-}
  local orthogroups=${3:-../Orthogroups.txt}
  local families="${sp}.Families.Strict.2R.rn.txt"
  local renamed="${sp}.Families.Strict.2R.pep.txt"
  local id_table="ens_${sp}_prot_genes.txt"
  local out="ohno2ortho_${sp}_strict.txt"
  local f line n=0

  for f in "$families" "$id_table" "$orthogroups"; do
    if [[ ! -r $f ]]; then
      printf 'ohno2ortho: cannot read %s; skipping %s\n' "$f" "$sp" >&2
      return 1
    fi
  done

  # One sed pass with one substitution per ID row (same result as running
  # `sed -i` once per row, without rewriting the file every time).
  # Rows with fewer than three fields are skipped: an empty search pattern
  # would insert column 2 at the first word boundary of every line.
  sed -f <(awk 'NF >= 3 { printf "s/\\(\\b\\)%s\\(\\b\\)/\\1%s\\2/\n", $3, $2 }' "$id_table") \
    "$families" > "$renamed" || return 1

  while IFS= read -r line || [[ -n $line ]]; do
    n=$((n + 1))
    # Tokens are grep basic-regex patterns, so matching is by substring, and
    # each orthogroup line is reported at most once per family.
    printf '%s\n' "$line" \
      | cut -f2- \
      | tr -d ',|' \
      | tr '\t ' '\n\n' \
      | sed '/^$/d' \
      | if [[ -n $exclude ]]; then grep -v -- "$exclude"; else cat; fi \
      | grep -f - -- "$orthogroups" \
      | cut -f1 -d':' \
      | cut -f1 \
      | awk -v n="$n" '{ for (f = 1; f <= NF; f++) print n "\t" $f }'
  done < "$renamed" > "$out"
}

ohno2ortho hsap &&
  # summarize 1:1 matching between ohnologs and OrthoFinder gene families
  ohno_count_summary ohno2ortho_hsap_strict.txt

# Repeat for other species. For locu and drer, tokens containing "to", "many"
# or "(1" are dropped before the lookup (presumably orthology-type annotations
# in those family files; confirm against the input data).
ohno2ortho acar
ohno2ortho locu 'to\|many\|(1'
ohno2ortho drer 'to\|many\|(1'
ohno2ortho mdom

# Summarize results
for table in ohno2ortho*; do
  [[ -e $table ]] || continue # glob matched nothing
  printf '%s\n' "$table"
  ohno_count_summary "$table"
  printf '%s\n' "-----"
done > counts_ohno2ortho2.txt
