/**********************************************************************************************************

1.  	LICENSE.  The University of York grants to you the USER a non-exclusive, non-transferable right 

        to use the SOFTWARE on file servers connected to a maximum number of user computers, or on a 

		maximum number of user computers, not exceeding the number of user computers specified either 

		on the packaging or by the accompanying letter if one is so provided, and if neither is provided 

		the maximum number shall be one.  USER agrees not to use such SOFTWARE for any commercial purpose,

		and limit use of SOFTWARE for the purposes of research only. It is further agreed that the 

		furnishing of SOFTWARE to USER shall not constitute any grant or license to USER under any legal 

		rights now or hereinafter held by University of York.  



2.  	USER agrees that USER will only copy the SOFTWARE into any machine readable or printed form as 

        necessary to use it in accordance with this Agreement or for backup purposes in support of USER’s 

		use of the SOFTWARE. 



3.  	COPYRIGHT.  The SOFTWARE is protected by copyright law and international treaty provisions.  

        USER acknowledges that no title to the intellectual property in the SOFTWARE is transferred to 

		USER.  USER further acknowledge that title and full ownership rights to the SOFTWARE will remain 

		the exclusive property of University of York, and USER will not acquire any rights to the SOFTWARE

		except as expressly set forth in this Agreement. 



4.      LIMITATIONS.  USER agrees that it will not attempt to reverse compile, modify, translate, or 

        disassemble the SOFTWARE in whole or in part.  USER may not rent, lease, transfer or sublicense 

		the SOFTWARE to third parties.  USER may not modify the SOFTWARE or create derivative works based 

		upon the SOFTWARE.  



5.  	UPGRADES.  If this copy of the SOFTWARE is an upgrade from an earlier version of the software, it 

        is provided to USER on an exchange basis.  USER agrees by its installation and use of this copy of

		the SOFTWARE to voluntarily terminate USER’s earlier version of the SOFTWARE nor transfer it to 

		another.



6.  	LIMITED WARRANTIES.  University of York warrants that the media on which the SOFTWARE is furnished 

	    will be free from defects in materials and workmanship under normal use.  



7.      DISCLAIMER OF WARRANTY: EXCEPT AS SET FORTH HEREIN, SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF 

        ANY KIND.  TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, UNIVERSITY OF YORK FURTHER DISCLAIMS 

		ALL EXPRESSED AND IMPLIED WARRANTIES, INCLUDING WITHOUT LIMITATIONS ANY WARRANTIES OF MERCHANTABILITY,

		FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS DISCLAIMER OF WARRANTY CONSTITUTES AN 

		ESSENTIAL PART OF THIS AGREEMENT.



8.      GOVERNING LAW.  This Agreement will be governed by the laws of England without regard to conflict 

        of laws.



9.	    NO LIABILITY FOR CONSEQUENTIAL DAMAGES: IN NO EVENT SHALL UNIVERSITY OF YORK BE LIABLE TO USER FOR 

        ANY CONSEQUENTIAL, SPECIAL, INCIDENTAL OR INDIRECT DAMAGES OF ANY KIND ARISING OUT OF THE DELIVERY, 

		PERFORMANCE OR USE OF THE SOFTWARE, EVEN IF UNIVERSITY OF YORK HAS BEEN ADVISED OF THE POSSIBILITY 

		OF SUCH DAMAGES.  IN NO EVENT WILL UNIVERSITY OF YORK’S LIABILITY FOR ANY CLAIM, WHETHER IN CONTRACT, 

		TORT OR ANY OTHER THEORY OF LIABILITY, EXCEED THE AGREEMENT FEE PAID BY USER, IF ANY.

 ************************************************************************************************************/


/*
Code to calculate the similarity of two sets of peptides, obtained for example from de novo sequencing. Only matches of at least 5 consecutive amino acids are considered.
 
A list of files to be compared is required, say filelist.txt, and the code will output all pairwise similarity scores. The number of files to be compared should be given on the first line of filelist.txt followed by the name of each file.

If the code is compiled to produce an executable called peptidematch, then this can be run as:
./peptidematch filelist.txt distmat.txt

 An overall score for the similarity of each pair of files is written to screen and the output file distmat.txt contains the distance matrix for the set of files.
*/

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
#define MAXPEPS 50000
#define MAXL 500

/* One peptide: its residue codes and how many of them are stored. */
typedef struct pepinf
{
  int *pepseq;   /* residue codes, one per amino acid; getSeq() stores (character - 64) */
  int length;    /* number of residues held in pepseq (the value returned by getSeq()) */
  int aa;        /* NOTE(review): not read or written by the visible code -- purpose unconfirmed */
} PEPINFO;

/* One candidate alignment of two peptides, anchored at a seed match. */
typedef struct twomatch
{
  int p1;        /* index in the first peptide where the seed match starts */
  int p2;        /* index in the second peptide where the seed match starts */
  int overlap;   /* number of aligned positions compared for this candidate */
  int prestart;  /* min(p1, p2): aligned positions that precede the seed */
  int score;     /* NOTE(review): not read or written by the visible code */
} MATCH;

/* One entry of the input file list. */
typedef struct fn
{
  char name[50];   /* NUL-terminated file name, so at most 49 characters */
} FILENAMES;

/* Forward declarations; both functions are defined after main(). */
/* Reads one peptide line from fp into v and returns the number of residues read. */
int getSeq(FILE *fp, int v[]);
/* Returns the match score between peptide ii of pep1 and peptide jj of pep2. */
double matchpeps(PEPINFO *pep1, int ii, PEPINFO *pep2, int jj);

/* Release a peptide table obtained from alloc_peptides(); accepts NULL. */
static void free_peptides(PEPINFO *pep)
{
  int i;

  if (pep == NULL) return;
  for (i = 0; i < MAXPEPS; i++)
  {
    free(pep[i].pepseq);
  }
  free(pep);
}

/* Allocate MAXPEPS peptide slots, each with room for MAXL residue codes.
   Returns NULL (with nothing left allocated) if memory runs out. */
static PEPINFO *alloc_peptides(void)
{
  int i;
  PEPINFO *pep = malloc(MAXPEPS * sizeof *pep);

  if (pep == NULL) return NULL;
  for (i = 0; i < MAXPEPS; i++)
  {
    pep[i].pepseq = malloc(MAXL * sizeof *pep[i].pepseq);
    if (pep[i].pepseq == NULL)
    {
      /* undo the slots allocated so far */
      while (i-- > 0) free(pep[i].pepseq);
      free(pep);
      return NULL;
    }
  }
  return pep;
}

/* Read one peptide file into pep: the first line holds the peptide count, then
   one sequence per line.  Echoes "<path> <count>" to stdout.
   Returns the count, or -1 if the file cannot be opened or the count is
   missing or outside 0..MAXPEPS. */
static int load_peptides(const char *path, PEPINFO *pep)
{
  int i;
  int npeps = 0;
  FILE *fp = fopen(path, "r");

  if (fp == NULL)
  {
    perror(path);
    return -1;
  }
  if (fscanf(fp, "%d\n", &npeps) != 1 || npeps < 0 || npeps > MAXPEPS)
  {
    fprintf(stderr, "%s: peptide count missing or outside 0..%d\n", path, MAXPEPS);
    fclose(fp);
    return -1;
  }
  printf("%s %d\n", path, npeps);
  for (i = 0; i < npeps; i++)
  {
    pep[i].length = getSeq(fp, pep[i].pepseq);
  }
  fclose(fp);
  return npeps;
}

/* Mean, over the peptides of `from`, of the best matchpeps() score each one
   achieves against any peptide of `to`.  An empty `from` scores 0 rather than
   dividing by zero. */
static double directed_score(PEPINFO *from, int nfrom, PEPINFO *to, int nto)
{
  int i, j;
  double total = 0.0;

  if (nfrom <= 0) return 0.0;
  for (i = 0; i < nfrom; i++)
  {
    double best = 0.0;
    for (j = 0; j < nto; j++)
    {
      double s = matchpeps(from, i, to, j);
      if (s > best) best = s;
    }
    total += best;
  }
  return total / (double)nfrom;
}

/*****************************************************
Procedure: main
Description: compares every pair of peptide files named in the list file
  argv[1] and writes the resulting distance matrix to argv[2].
  The list file holds the number of files on its first line, followed by
  one file name per entry (at most 49 characters each).
  For each pair the two directed scores and their mean are printed to
  stdout; the stored distance is 1 - mean and the matrix is symmetric
  with a zero diagonal.
Returns: EXIT_SUCCESS, or EXIT_FAILURE on a usage, I/O, format or
  memory error (a message is written to stderr).
******************************************************/
int main (int argc, char *argv[])
{
  int k, l;
  int numfiles = 0;
  int npeps1, npeps2;
  int status = EXIT_FAILURE;
  double totalscore, totalscore1, totalscore2;
  size_t nf = 0;
  char token[256];
  FILE *fin = NULL;
  FILE *fout = NULL;
  PEPINFO *pep1 = NULL;
  PEPINFO *pep2 = NULL;
  FILENAMES *filelist = NULL;
  double *distmat = NULL;

  if (argc < 3)
  {
    fprintf(stderr, "Usage: %s filelist.txt distmat.txt\n", (argc > 0) ? argv[0] : "peptidematch");
    return EXIT_FAILURE;
  }

  pep1 = alloc_peptides();
  pep2 = alloc_peptides();
  if (pep1 == NULL || pep2 == NULL)
  {
    fprintf(stderr, "Out of memory allocating peptide tables\n");
    goto cleanup;
  }

  /* read list of filenames */
  fin = fopen(argv[1], "r");
  if (fin == NULL)
  {
    perror(argv[1]);
    goto cleanup;
  }
  if (fscanf(fin, "%d\n", &numfiles) != 1 || numfiles < 1)
  {
    fprintf(stderr, "%s: first line must hold a positive number of files\n", argv[1]);
    goto cleanup;
  }
  nf = (size_t)numfiles;

  filelist = malloc(nf * sizeof *filelist);
  distmat = calloc(nf * nf, sizeof *distmat);
  if (filelist == NULL || distmat == NULL)
  {
    fprintf(stderr, "Out of memory allocating file list / distance matrix\n");
    goto cleanup;
  }

  for (k = 0; k < numfiles; k++)
  {
    /* read into a wider buffer first so an over-long name is reported
       instead of overflowing name[] or being split into two entries */
    if (fscanf(fin, "%255s", token) != 1)
    {
      fprintf(stderr, "%s: expected %d file names, found %d\n", argv[1], numfiles, k);
      goto cleanup;
    }
    if (strlen(token) >= sizeof filelist[k].name)
    {
      fprintf(stderr, "%s: file name '%s' is longer than %d characters\n",
              argv[1], token, (int)(sizeof filelist[k].name) - 1);
      goto cleanup;
    }
    strcpy(filelist[k].name, token);
    printf("%s\n", filelist[k].name);
  }
  fclose(fin);
  fin = NULL;

  /* distance matrix: zero diagonal */
  for (k = 0; k < numfiles; k++)
  {
    distmat[(size_t)k + nf*(size_t)k] = 0.0;
  }

  for (k = 0; k < numfiles; k++)
  {
    for (l = k+1; l < numfiles; l++)
    {
      const char *input1 = filelist[k].name;
      const char *input2 = filelist[l].name;

      npeps1 = load_peptides(input1, pep1);
      if (npeps1 < 0) goto cleanup;
      npeps2 = load_peptides(input2, pep2);
      if (npeps2 < 0) goto cleanup;

      totalscore1 = directed_score(pep1, npeps1, pep2, npeps2);
      printf("%s to %s : %f\n", input2, input1, totalscore1);

      totalscore2 = directed_score(pep2, npeps2, pep1, npeps1);
      printf("%s to %s : %f\n", input1, input2, totalscore2);

      totalscore = (totalscore1 + totalscore2)/2.0;
      printf("%s to %s: protein similarity score: %f\n", filelist[k].name, filelist[l].name, totalscore);
      distmat[(size_t)k + nf*(size_t)l] = 1 - totalscore;
      distmat[(size_t)l + nf*(size_t)k] = 1 - totalscore;
    }
  }

  /* write out distance matrix */
  fout = fopen(argv[2], "w");
  if (fout == NULL)
  {
    perror(argv[2]);
    goto cleanup;
  }
  for (k = 0; k < numfiles; k++)
  {
    fprintf(fout, "%s ", filelist[k].name);
  }
  fprintf(fout, "\n");
  for (l = 0; l < numfiles; l++)
  {
    fprintf(fout, "%s ", filelist[l].name);
    for (k = 0; k < numfiles; k++)
    {
      fprintf(fout, "%f ", distmat[(size_t)l + nf*(size_t)k]);
    }
    fprintf(fout, "\n");
  }
  if (ferror(fout))
  {
    fprintf(stderr, "%s: write error\n", argv[2]);
    goto cleanup;
  }
  /* fclose flushes buffered output, so its result decides success */
  if (fclose(fout) != 0)
  {
    fout = NULL;
    perror(argv[2]);
    goto cleanup;
  }
  fout = NULL;
  status = EXIT_SUCCESS;

cleanup:
  if (fin != NULL) fclose(fin);
  if (fout != NULL) fclose(fout);
  free(distmat);
  free(filelist);
  free_peptides(pep1);
  free_peptides(pep2);
  return status;
}

/*****************************************************
Procedure: getSeq
Description: reads one line from fp as a peptide sequence.
  Each character is stored in v as (character code - 64), so with
  ASCII input 'A' becomes 1, 'B' becomes 2, and so on.
  The line ends at LF, CR, CR LF or end of file; the terminator is
  consumed but not stored, so a CR LF file does not produce an empty
  sequence on every second call.
  At most MAXL codes are stored; any further characters on the line
  are read and discarded so v is never overrun.
Returns: the number of codes stored in v (0 for an empty line or
  when fp is already at end of file).
******************************************************/
#ifndef MAXL
#define MAXL 500   /* fallback only: normally defined earlier in this file */
#endif

int getSeq(FILE *fp, int v[])
{
  int n = 0;
  int ch;

  /* testing EOF here is what stops a final line without a newline
     from looping forever */
  while ((ch = fgetc(fp)) != EOF && ch != 10 && ch != 13)
  {
    if (n < MAXL)
    {
      v[n] = ch - 64;
      n++;
    }
  }

  /* CR LF line ending: swallow the LF that follows the CR */
  if (ch == 13)
  {
    ch = fgetc(fp);
    if (ch != 10 && ch != EOF)
    {
      ungetc(ch, fp);
    }
  }

  return n;
}

/* Number of consecutive identical residues required to seed an alignment. */
enum { SEED_LEN = 5 };

/* Returns 1 when the SEED_LEN residues starting at a and at b are identical. */
static int is_seed(const int *a, const int *b)
{
  int i;

  for (i = 0; i < SEED_LEN; i++)
  {
    if (a[i] != b[i]) return 0;
  }
  return 1;
}

/* Scores `overlap` aligned positions of a against b.
   A run of r consecutive matches contributes 1+2+...+r and a run of r
   consecutive mismatches contributes -(1+2+...+r); the sum is divided by
   overlap*(overlap+1)/2, the value reached when every position matches.
   The division is done in float, as the scores always have been, so that
   results stay numerically identical. */
static double score_alignment(const int *a, const int *b, int overlap)
{
  int i;
  int run = 0;      /* signed length of the current match (+) or mismatch (-) run */
  int score = 0;
  int denom = overlap * (overlap + 1) / 2;

  for (i = 0; i < overlap; i++)
  {
    if (a[i] == b[i])
    {
      run = (run >= 0) ? run + 1 : 1;
    }
    else
    {
      run = (run < 0) ? run - 1 : -1;
    }
    score += run;
  }
  return (float)score / (float)denom;
}

/*****************************************************
Procedure: matchpeps
Description: gets the score for a peptide match.
  Every position pair (k, j) at which peptide ii of pep1 and peptide jj
  of pep2 share SEED_LEN consecutive residues defines an ungapped
  alignment.  The alignment is extended to both ends of the shorter
  overhang and scored with score_alignment().
  No memory is allocated: each candidate is scored as soon as it is found.
Returns: the best alignment score found, or 0.0 when the peptides share
  no seed or no alignment scores above zero.
******************************************************/
double matchpeps(PEPINFO *pep1, int ii, PEPINFO *pep2, int jj)
{
  const int *a = pep1[ii].pepseq;
  const int *b = pep2[jj].pepseq;
  int l1 = pep1[ii].length;
  int l2 = pep2[jj].length;
  double bestscore = 0.0;
  int k, j;

  for (k = 0; k + SEED_LEN <= l1; k++)
  {
    for (j = 0; j + SEED_LEN <= l2; j++)
    {
      int prestart, overlap;
      double s;

      /* only consider matches of at least 5 amino acids */
      if (!is_seed(a + k, b + j)) continue;

      /* If the residues just before this seed also match, (k-1, j-1) was a
         seed too and lies on the same diagonal, so this exact alignment has
         already been scored. */
      if (k > 0 && j > 0 && a[k-1] == b[j-1]) continue;

      /* extend back to the start of whichever peptide begins later ... */
      prestart = (k < j) ? k : j;
      /* ... and forward to the end of whichever peptide ends first */
      overlap = ((l1 - k < l2 - j) ? (l1 - k) : (l2 - j)) + prestart;

      s = score_alignment(a + k - prestart, b + j - prestart, overlap);
      if (s > bestscore) bestscore = s;
    }
  }

  return bestscore;
}



