+ finished first running version of the spliced/unspliced heuristic
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 14 Apr 2008 12:28:38 +0000 (12:28 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 14 Apr 2008 12:28:38 +0000 (12:28 +0000)
+ removed dead code from the data generation script
TODO
- generation of alternative alignments in the heuristic may not be optimal

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8516 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py
scripts/compile_dataset.py
scripts/createNewDataset.py
scripts/qpalma_main.py

index 1afcff7..5834634 100644 (file)
@@ -9,6 +9,44 @@ import os
 import os.path
 import math
 
+from qpalma.DataProc import *
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.penalty_lookup_new import *
+from qpalma.compute_donacc import *
+from qpalma.TrainingParam import Param
+from qpalma.Plif import Plf
+
+from qpalma.tools.splicesites import getDonAccScores
+from qpalma.Configuration import *
+
+from compile_dataset import getSpliceScores, get_seq_and_scores
+
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy import inf
+
+from qpalma.parsers import PipelineReadParser
+
+
+def unbracket_est(est):
+   new_est = ''
+   e = 0
+
+   while True:
+      if e >= len(est):
+         break
+
+      if est[e] == '[':
+         new_est += est[e+2]
+         e += 4
+      else:
+         new_est += est[e]
+         e += 1
+
+   return "".join(new_est).lower()
+
 
 class PipelineHeuristic:
    """
@@ -16,62 +54,208 @@ class PipelineHeuristic:
    vmatch is spliced an should be then newly aligned using QPalma or not.
    """
 
-   def __init__(self,filename):
-      self.data_filename = filename
+   def __init__(self,run_fname,data_fname,param_fname):
+      """
+      We need a run object holding information about the nr. of support points
+      etc.
+      """
 
+      run = cPickle.load(open(run_fname))
+      self.run = run
 
-   def filter(self):
-      SeqInfo, Exons, OriginalEsts, Qualities,\
-      AlternativeSequences = paths_load_data(data_filename,'training',None,self.ARGS)
+      self.data_fname = data_fname
+
+      self.param = cPickle.load(open(param_fname))
+      
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
+
+      self.h = h
+      self.d = d
+      self.a = a
+      self.mmatrix = mmatrix
+      self.qualityPlifs = qualityPlifs
+
+      # when we look for alternative alignments with introns this value is the
+      # mean of intron size
+      self.intron_size  = 250
+
+      self.read_size    = 36
    
-      for idx in range(len(SeqInfo)):
-         currentVMatchAlignment
-         vMatchScore = calcAlignmentScore(currentVMatchAlignment)
 
-         alternativeAlignments = calcAlternativeAlignments(currentVMatchAlignment)
+   def filter(self):
+      """
+      This method...
+      """
+      run = self.run 
+
+      rrp = PipelineReadParser(self.data_fname)
+      all_remapped_reads = rrp.parse()
+
+
+      for readId,currentReadLocations in all_remapped_reads.items():
+         for location in currentReadLocations:
+            chr      = location['chr']
+            pos      = location['pos']
+            strand   = location['strand']
+            mismatch = location['mismatches']
+            length   = location['length']
+            off      = location['offset']
+            seq      = location['seq']
+            prb      = location['prb']
+            cal_prb  = location['cal_prb']
+            chastity = location['chastity']
+            is_spliced = location['is_spliced']
+      
+            unb_seq = unbracket_est(seq)
+            effective_len = len(unb_seq)
+
+            genomicSeq_start  = pos
+            genomicSeq_stop   = pos+effective_len-1
+
+            currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+
+            dna            = currentDNASeq
+            exons          = zeros((2,1))
+            exons[0,0]     = 0
+            exons[1,0]     = effective_len
+            est            = unb_seq
+            original_est   = seq
+            quality        = prb
 
-         maxScore    = 0.0
-         maxAlignment = None
-         for currentAlternative in alternativeAlignments:
-            if currentScore > maxScore:
-               maxScore = alternativeScores 
-               maxAlignment = currentAlternative
+            pdb.set_trace()
 
-            currentScore = calcAlignmentScore(currentAlternative))
+            currentVMatchAlignment = dna, exons, est, original_est, quality,\
+            currentAcc, currentDon
+            vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
 
-         # Seems that according to our learned parameters VMatch found a good
-         # alignment of the current read
-         if maxScore < vMatchScore:
-            pass
-         # We found an alternative alignment considering splice sites that scores
-         # higher than the VMatch alignment
-         else:
-            pass
-            
+            print 'vMatchScore is %f' % vMatchScore
 
 
-   def calcAlternativeAlignments(self,currentVMatchAlignment):
+            alternativeAlignmentScore = self.calcAlternativeAlignments(location)
+
+            print 'alternative score is %f' % alternativeAlignmentScore 
+
+            """
+            maxScore    = 0.0
+            maxAlignment = None
+            for currentAlternative in alternativeAlignments:
+               if currentScore > maxScore:
+                  maxScore = alternativeScores 
+                  maxAlignment = currentAlternative
+
+            currentScore = calcAlignmentScore(currentAlternative)
+
+            # Seems that according to our learned parameters VMatch found a good
+            # alignment of the current read
+            if maxScore < vMatchScore:
+               pass
+            # We found an alternative alignment considering splice sites that scores
+            # higher than the VMatch alignment
+            else:
+               pass
+            """
+
+   def findHighestScoringSpliceSites(self,currentAcc,currentDon):
+      pdb.set_trace()
+      max_don  = -inf
+      don_pos  = 0
+      for idx,score in enumerate(currentDon):
+         if score > -inf and idx < (len(currentDon)*0.5):
+            don_pos = idx
+            break
+
+      max_acc  = -inf
+      acc_pos  = 0
+      for idx,score in enumerate(currentAcc):
+         if score > -inf and idx > don_pos:
+            acc_pos = idx
+            break
+
+      return don_pos,acc_pos
+
+
+   def calcAlternativeAlignments(self,location):
       """
       Given an alignment proposed by Vmatch this function calculates possible
       alternative alignments taking into account for example matched
       donor/acceptor positions.
       """
+      run = self.run
+
+      chr      = location['chr']
+      pos      = location['pos']
+      strand   = location['strand']
+      seq      = location['seq']
+      prb      = location['prb']
+      cal_prb  = location['cal_prb']
+      is_spliced = location['is_spliced']
       
-      currentVMatchAlignment
+      unb_seq = unbracket_est(seq)
+      effective_len = len(unb_seq)
 
+      genomicSeq_start  = pos
+      genomicSeq_stop   = pos+self.intron_size+self.read_size*2
 
-      return alternativeAlignments 
+      currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
 
+      don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
 
-   def calcAlignmentScore(self,dna,exons,est):
+      print don_pos,acc_pos
+
+      """
+      donor_elem = dna[exons[0,1]:exons[0,1]+2]
+      acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
+      """
+
+      dna            = currentDNASeq
+      exons          = zeros((2,2),dtype=numpy.int)
+      exons[0,0]     = 0
+      exons[0,1]     = don_pos
+      exons[1,0]     = acc_pos+1
+      exons[1,1]     = acc_pos+1+(self.read_size-don_pos)
+      est            = unb_seq
+      original_est   = seq
+      quality        = prb
+
+      dna = dna[:int(exons[1,1])]
+      currentAcc = currentAcc[:int(exons[1,1])]
+      currentDon = currentDon[:int(exons[1,1])]
+
+      currentVMatchAlignment = dna, exons, est, original_est, quality,\
+      currentAcc, currentDon
+
+      alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
+
+      return alternativeScore
+
+
+   def calcAlignmentScore(self,alignment):
       """
       Given an alignment (dna,exons,est) and the current parameter for QPalma
       this function calculates the dot product of the feature representation of
       the alignment and the parameter vector i.e the alignment score. 
       """
+      run = self.run
+
+      h = self.h
+      d = self.d
+      a = self.a
+      mmatrix = self.mmatrix
+      qualityPlifs = self.qualityPlifs
+
+      lengthSP    = run['numLengthSuppPoints']
+      donSP       = run['numDonSuppPoints']
+      accSP       = run['numAccSuppPoints']
+      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+      numq        = run['numQualSuppPoints']
+      totalQualSP = run['totalQualSuppPoints']
 
       currentPhi = zeros((run['numFeatures'],1))
 
+      # Lets start calculation
+      dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
+
       # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
       trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
       computeSpliceAlignWithQuality(dna, exons, est, original_est,\
@@ -80,6 +264,7 @@ class PipelineHeuristic:
       # Calculate the weights
       trueWeightDon, trueWeightAcc, trueWeightIntron =\
       computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+
       trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
 
       currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
@@ -87,17 +272,25 @@ class PipelineHeuristic:
       currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
       currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
 
-      if run['mode'] == 'using_quality_scores':
-         totalQualityPenalties = param[-totalQualSP:]
-         currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
+      totalQualityPenalties = self.param[-totalQualSP:]
+      currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
 
       # Calculate w'phi(x,y) the total score of the alignment
       return (trueWeight.T * currentPhi)[0,0]
 
 
-
 if __name__ == '__main__':
-   filename = sys.argv[1]
-   out_filename = sys.argv[2]
-   ph1 = PipelineHeuristic(filename)
+   #run_fname = sys.argv[1]
+   #data_fname = sys.argv[2]
+   #param_filename = sys.argv[3]
+
+   dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
+   jp = os.path.join
+
+   run_fname      = jp(dir,'run_object.pickle')
+   data_fname     = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
+
+   param_fname = jp(dir,'param_500.pickle')
+
+   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname)
    ph1.filter()
index 632652f..79a2b98 100644 (file)
@@ -345,42 +345,6 @@ def process_map(reReads,fRead,dna_flat_files):
 
       CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
    
-      
-      """
-      # vmatch found the correct position
-      if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start'] <= pos <= fRead['p_stop']:
-         currentLabel = True
-
-      # first half of the read matches somewhere
-      # search downstream for the rest
-      if offset == 0:
-         genomicSeq_start = pos
-         genomicSeq_stop = pos + fragment_size
-         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
-         if currentLabel:
-            pdb.set_trace()
-         continue
-
-      # second half of the read matches somewhere
-      # search upstream for the rest
-      elif offset != 0 and length + offset == Conf.read_size:
-         genomicSeq_start = pos - fragment_size
-         genomicSeq_stop = pos+length
-         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
-         if currentLabel:
-            pdb.set_trace()
-         continue
-
-      # middle part of the read matches somewhere
-      # search up/downstream for the rest
-      else:
-         genomicSeq_start = pos - (fragment_size/2)
-         genomicSeq_stop = pos + (fragment_size/2)
-         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
-         continue
-      """
-
-
       # vmatch found the correct position
       if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
          currentLabel = True
index 9782997..917b107 100644 (file)
@@ -4,18 +4,11 @@
 import qpalma.Configuration as Conf
 from compile_dataset import compile_d
 
-#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_11_02_2008_subset'
-#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/halleluja.txt_1000'
 
-#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_11_02_2008'
-#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/filteredReads_18_03_2008'
-#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_18_03_2008'
+#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads_20_03_2008'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_05_04_2008_tmp'
 
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allFilteredReads_20_03_2008'
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/unfilteredReads_10_04'
+remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_unspliced'
 
-remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_05_04_2008_tmp'
-
-#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_1k'
-#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/halleluja.txt'
-
-compile_d(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,Conf.tmp_dir,'dataset_remapped',True)
+compile_d(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,Conf.tmp_dir,'dataset_remapped_unspliced',True)
index 1112c2f..60a86df 100644 (file)
@@ -280,7 +280,7 @@ class QPalma:
       data_filename = self.run['dataset_filename']
 
       SeqInfo, Exons, OriginalEsts, Qualities,\
-      AlternativeSequences = paths_load_data(data_filename,'training',None,self.ARGS)
+      AlternativeSequences = paths_load_data(data_filename)
 
       # Load the whole dataset 
       if self.run['mode'] == 'normal':
@@ -663,7 +663,8 @@ class QPalma:
       data_filename = self.run['dataset_filename']
 
       SeqInfo, Exons, OriginalEsts, Qualities,\
-      AlternativeSequences = paths_load_data(data_filename,'training',None,self.ARGS)
+      #AlternativeSequences = paths_load_data(data_filename,'training',None,self.ARGS)
+      AlternativeSequences = paths_load_data(data_filename)
 
       self.SeqInfo     = SeqInfo
       self.Exons       = Exons