+ got rid of some legacy code
[qpalma.git] / qpalma / computeSpliceAlignWithQuality.py
index 1ec72c9..e498098 100644 (file)
@@ -1,14 +1,19 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
 import pdb
 from Plif import Plf,base_coord,linspace
-import Configuration as Conf
 
-def  computeSpliceAlignWithQuality(dna, exons, est=None, quality=None,
-qualityPlifs=None):
+import sequence_utils
+
+def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
    """
    Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
    Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
@@ -19,91 +24,116 @@ qualityPlifs=None):
    cccccXXcccccA1cccccE1 ... Em-1cccXXcccccccc
    """
 
-   numberOfExons = (exons.shape)[0] # how many rows ?
-   exonSizes = [-1]*numberOfExons
+   SpliceAlign = []
 
-   #assert numberOfExons == 3
+   if exons.shape == (2,2):
+      numberOfExons = 2
+      exonSizes = [-1]*numberOfExons
 
-   for idx in range(numberOfExons):
-      exonSizes[idx] = exons[idx,1] - exons[idx,0]
+      exonSizes[0] = exons[0,1] - exons[0,0]
+      exonSizes[1] = exons[1,1] - exons[1,0]
 
-   # SpliceAlign vector describes alignment: 
-   # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
-   SpliceAlign = []
+      # SpliceAlign vector describes alignment: 
+      # 1:donorpos, 3:intron 2:acceptorpos, 0:exon, 4: dangling end
 
-   if exons[0,0] > 0:
-      SpliceAlign.extend([4]*(exons[0,0]))
+      if exons[0,0] > 0:
+         SpliceAlign.extend([4]*(exons[0,0]))
 
-   for idx in range(numberOfExons):
-      exonLength = exonSizes[idx]
-      SpliceAlign.extend([0]*exonLength)
-      
-      if idx < numberOfExons-1:
-         intronLength = exons[idx+1,0] - exons[idx,1]
-         SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
+      for idx in range(numberOfExons):
+         exonLength = exonSizes[idx]
+         SpliceAlign.extend([0]*exonLength)
+         
+         if idx < numberOfExons-1:
+            intronLength = exons[idx+1,0] - exons[idx,1]
+            SpliceAlign.extend([1]+[3]*(intronLength-2)+[2])
 
-   if len(dna) > exons[-1,1]:
-      SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
+      if len(dna) > exons[-1,1]:
+         SpliceAlign.extend([4]*(len(dna)-exons[-1,1]))
 
-   assert len(SpliceAlign) == len(dna), pdb.set_trace()
 
-   # number of matches: just have to look at the underlying est
-   # est = dna(find(SpliceAlign==0)) # exon nucleotides
-   exonPos = [pos for pos,elem in enumerate(SpliceAlign) if elem == 0]
+   elif exons.shape == (2,1):
+      exonSize = exons[1,0] - exons[0,0]
+
+      if exons[0,0] > 0:
+         SpliceAlign.extend([4]*(exons[0,0]))
 
-   #est = [elem for pos,elem in enumerate(dna) if pos in exonPos]
+      SpliceAlign.extend([0]*(exonSize))
 
-   #length_est = sum(exon_sizes) ;
-   totalESTLength = 0
-   for elem in exonSizes:
-      totalESTLength += elem
+      if len(dna) > exons[1,0]:
+         SpliceAlign.extend([4]*(len(dna)-exons[1,0]))
 
-   assert totalESTLength == len(est) 
+   else:
+      pass
 
-   sizeMatchmatrix = Conf.sizeMatchmatrix
+   assert len(SpliceAlign) == len(dna), pdb.set_trace()
+
+   #  
+   # start of label feature generation
+   #  
+
+   sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
    # counts the occurences of a,c,g,t,n in this order
-   numChar = [0]*sizeMatchmatrix[0]
 
    # counts the occurrences of a,c,g,t,n with their quality scores
    trueWeightQualityMat = [0.0]*5 # 5 rows
    for row in range(5):
       trueWeightQualityMat[row] = [0.0]*6 # 6 col per row
 
-   for elem in est:
-      if elem == 'a':
-         numChar[0] += 1
-      if elem == 'c':
-         numChar[1] += 1
-      if elem == 'g':
-         numChar[2] += 1
-      if elem == 't':
-         numChar[3] += 1
-      if elem == 'n':
-         numChar[4] += 1
-
    for row in range(5):
       for col in range(6):
-         trueWeightQualityMat[row][col] = [0.0]*Conf.numQualSuppPoints # supp. points per plif
+         trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
 
-   first_exon  = dna[exons[0,0]:exons[0,1]]
-   second_exon = dna[exons[1,0]:exons[1,1]]
-   dna_part = first_exon + second_exon
+   dna_part = []
+   est_part = []
 
-   assert len(dna_part) == len(est)
-   assert len(est) == sum(numChar)
-   
-   #print numChar
-   #print est
+   orig_pos = 0
+   while True:
+      if not orig_pos < len(original_est):
+         break
+
+      orig_char = original_est[orig_pos]
+
+      if orig_char == '[':
+         dna_part.append( original_est[orig_pos+1] )
+         est_part.append( original_est[orig_pos+2] )
+         orig_pos += 4
+      else:
+         dna_part.append( orig_char )
+         est_part.append( orig_char )
+         orig_pos += 1
+
+   for orig_char in dna_part:
+      assert orig_char in sequence_utils.alphabet, pdb.set_trace()
+
+   for orig_char in est_part:
+      assert orig_char in sequence_utils.alphabet, pdb.set_trace()
+
+   trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
 
    map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
 
-   if Conf.mode == 'using_quality_scores':
+   ctr = 0
+   quality_part = []
+   for elem in est_part:
+      if elem == '-':
+         quality_part.append(-inf)
+      else:
+         quality_part.append(quality[ctr])
+         ctr += 1
+
+   assert len(est_part) == len(quality_part)
+
+   for idx in range(len(dna_part)):
+      dnanum = map[dna_part[idx]]
+      estnum = map[est_part[idx]]
+      value = quality_part[idx]
 
-      for idx in range(len(dna_part)):
-         dnanum = map[dna_part[idx]]
-         estnum = map[est[idx]]
-         value = quality[idx]
+      if estnum == 0:
+         trueWeightMatch[dnanum] += 1.0
+         assert value == -inf
 
+      else:
+         assert value != -inf
          cur_plf = qualityPlifs[(estnum-1)*6+dnanum]
 
          Lower = len([elem for elem in cur_plf.limits if elem <= value])
@@ -123,36 +153,109 @@ qualityPlifs=None):
             trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
             trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
 
-      trueWeightQuality = zeros((Conf.numQualSuppPoints*Conf.numQualPlifs,1))
+      trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
 
       ctr = 0
       for row in range(5):
          for col in range(6):
-            for p_idx in range(Conf.numQualSuppPoints):
+            for p_idx in range(run['numQualSuppPoints']):
                #print ctr, row, col, p_idx
                trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
                ctr += 1
 
-   #assert int(sum(trueWeightQuality.flatten().tolist()[0])) == len(est)
+   #pdb.set_trace()
+   featureSum = 0
+   featureSum += sum([e for p,e in enumerate(trueWeightQuality.flatten().tolist()[0]) if e != 0.0])
+   featureSum += sum(trueWeightMatch.flatten().tolist()[0])
 
-   totalNumChar = 0
-   for idx in range(sizeMatchmatrix[0]):
-      totalNumChar += numChar[idx]
+   #assert int(featureSum) == len(est_part),pdb.set_trace()
+   # 'feature sum is not equal read size!!'
 
-   assert totalNumChar == len(est), "numChar %d, len(est) %d" % (totalNumChar,len(est))
+   if exons.shape == (2,2):
+      dna_orig = dna[exons[0,0]:exons[0,1]]
+      dna_orig += dna[exons[1,0]:exons[1,1]]
+   elif exons.shape == (2,1):
+      #dna_orig = dna[exons[0,0]:exons[1,0]]
+      pass
+   else:
+      pass
 
-   # writing in weight match matrix
-   # matrix is saved columnwise
-   if Conf.mode == 'normal':
-      trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
-      for idx in range(1,sizeMatchmatrix[0]):
-         trueWeightMatch[idx,idx] = numChar[idx-1]
+   dna_calc = "".join(dna_part)
 
-   if Conf.mode == 'using_quality_scores':
-      trueWeightMatch = zeros((sizeMatchmatrix[0],sizeMatchmatrix[1])) # Scorematrix fuer Wahrheit
-      #for idx in range(1,sizeMatchmatrix[0]):
-      #   trueWeightMatch[idx] = numChar[idx-1]
+   #assert dna_orig == dna_calc, pdb.set_trace()
 
-   trueWeightMatch = trueWeightMatch.reshape(sizeMatchmatrix[0]*sizeMatchmatrix[1],1)
+   return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc
 
-   return SpliceAlign, trueWeightMatch, trueWeightQuality
+
+def  computeSpliceAlignScoreWithQuality(original_est, quality, qualityPlifs, run, weightvector):
+   """
+   """
+
+   lengthSP    = run['numLengthSuppPoints']
+   donSP       = run['numDonSuppPoints']
+   accSP       = run['numAccSuppPoints']
+   mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+   numq        = run['numQualSuppPoints']*5*6
+
+   lengthP = weightvector[0:lengthSP]
+   donP = weightvector[lengthSP:lengthSP+donSP]
+   accP = weightvector[donSP+lengthSP:donSP+lengthSP+accSP]
+   mmatrixP = weightvector[accSP+donSP+lengthSP:accSP+donSP+lengthSP+mmatrixSP]
+   numqP = weightvector[accSP+donSP+lengthSP+mmatrixSP:accSP+donSP+lengthSP+mmatrixSP+numq]
+   numQualSuppPoints=run['numQualSuppPoints']
+   
+   map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5, '[': 10, ']':11 }
+   original_est_mapped = [ map[e] for e in original_est ]
+
+   if True:
+       score=0.0
+       est_ptr=0
+       dna_ptr=0
+       ptr=0 
+       while ptr<len(original_est_mapped):
+           
+           if original_est_mapped[ptr]==10: #'[':
+               dnaletter=original_est_mapped[ptr+1]
+               estletter=original_est_mapped[ptr+2]
+               ptr+=4 
+           else:
+               dnaletter=original_est_mapped[ptr]
+               estletter=dnaletter
+               ptr+=1
+
+           if estletter==0:# '-':
+               score+= mmatrixP[dna_letter]
+           else:
+               value = quality[est_ptr]
+               cur_plf = qualityPlifs[(estletter-1)*6+dnaletter]
+               
+               Lower = 0
+               for elem in cur_plf.limits:
+                   if elem<=value:
+                       Lower+=1 
+               
+               if Lower == 0:
+                   score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+0] ;
+                   
+               elif Lower == len(cur_plf.limits):
+                   score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints + numQualSuppPoints-1] ;
+
+               else:
+                   # because we count from 0 in python
+                   Lower -= 1
+                   Upper = Lower+1 ; # x-werte bleiben fest
+                   weightup  = 1.0*(value - cur_plf.limits[Lower]) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
+                   weightlow = 1.0*(cur_plf.limits[Upper] - value) / (cur_plf.limits[Upper] - cur_plf.limits[Lower])
+                   
+                   score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Upper]*weightup ;
+                   score += numqP[(estletter-1)*6*numQualSuppPoints + dnaletter*numQualSuppPoints+Lower]*weightlow ;
+                                  
+           if estletter==0:# '-':
+               dna_ptr+=1 
+           elif dnaletter==0:#'-':
+               est_ptr+=1
+           else:
+               dna_ptr+=1 
+               est_ptr+=1
+               
+   return score