+ refined creation of alternative alignments
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 14 Apr 2008 14:41:32 +0000 (14:41 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 14 Apr 2008 14:41:32 +0000 (14:41 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8519 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py

index 5834634..6979d23 100644 (file)
@@ -78,7 +78,7 @@ class PipelineHeuristic:
 
       # when we look for alternative alignments with introns this value is the
       # mean of intron size
-      self.intron_size  = 250
+      self.intron_size  = 90
 
       self.read_size    = 36
    
@@ -92,6 +92,11 @@ class PipelineHeuristic:
       rrp = PipelineReadParser(self.data_fname)
       all_remapped_reads = rrp.parse()
 
+      ctr = 0
+      unspliced_ctr  = 0
+      spliced_ctr    = 0
+
+      print 'Starting filtering...'
 
       for readId,currentReadLocations in all_remapped_reads.items():
          for location in currentReadLocations:
@@ -106,6 +111,15 @@ class PipelineHeuristic:
             cal_prb  = location['cal_prb']
             chastity = location['chastity']
             is_spliced = location['is_spliced']
+
+            if strand == '-':
+               continue
+
+            if ctr == 100:
+               break
+
+            #if pos > 10000000:
+            #   continue
       
             unb_seq = unbracket_est(seq)
             effective_len = len(unb_seq)
@@ -113,6 +127,7 @@ class PipelineHeuristic:
             genomicSeq_start  = pos
             genomicSeq_stop   = pos+effective_len-1
 
+            #print genomicSeq_start,genomicSeq_stop
             currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
 
             dna            = currentDNASeq
@@ -123,52 +138,50 @@ class PipelineHeuristic:
             original_est   = seq
             quality        = prb
 
-            pdb.set_trace()
+            #pdb.set_trace()
 
             currentVMatchAlignment = dna, exons, est, original_est, quality,\
             currentAcc, currentDon
             vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
 
-            print 'vMatchScore is %f' % vMatchScore
-
+            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
 
-            alternativeAlignmentScore = self.calcAlternativeAlignments(location)
-
-            print 'alternative score is %f' % alternativeAlignmentScore 
-
-            """
-            maxScore    = 0.0
-            maxAlignment = None
-            for currentAlternative in alternativeAlignments:
-               if currentScore > maxScore:
-                  maxScore = alternativeScores 
-                  maxAlignment = currentAlternative
-
-            currentScore = calcAlignmentScore(currentAlternative)
+            # found no alternatives
+            if alternativeAlignmentScores == []:
+               continue
+            
+            maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+            #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
+            #print 'all candidates %s' % str(alternativeAlignmentScores)
 
             # Seems that according to our learned parameters VMatch found a good
             # alignment of the current read
-            if maxScore < vMatchScore:
-               pass
+            if maxAlternativeAlignmentScore < vMatchScore:
+               unspliced_ctr += 1
             # We found an alternative alignment considering splice sites that scores
             # higher than the VMatch alignment
             else:
-               pass
-            """
+               spliced_ctr += 1
+
+            ctr += 1
+
+      print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
+
 
    def findHighestScoringSpliceSites(self,currentAcc,currentDon):
-      pdb.set_trace()
       max_don  = -inf
-      don_pos  = 0
+      don_pos  = []
       for idx,score in enumerate(currentDon):
-         if score > -inf and idx < (len(currentDon)*0.5):
-            don_pos = idx
+         if score > -inf and idx > 1 and idx < self.read_size:
+            don_pos.append(idx)
+
+         if len(don_pos) == 2:
             break
 
       max_acc  = -inf
-      acc_pos  = 0
+      acc_pos  = []
       for idx,score in enumerate(currentAcc):
-         if score > -inf and idx > don_pos:
+         if score > -inf and idx > self.intron_size:
             acc_pos = idx
             break
 
@@ -187,6 +200,7 @@ class PipelineHeuristic:
       pos      = location['pos']
       strand   = location['strand']
       seq      = location['seq']
+      orig_seq = location['orig_seq']
       prb      = location['prb']
       cal_prb  = location['cal_prb']
       is_spliced = location['is_spliced']
@@ -195,39 +209,42 @@ class PipelineHeuristic:
       effective_len = len(unb_seq)
 
       genomicSeq_start  = pos
-      genomicSeq_stop   = pos+self.intron_size+self.read_size*2
+      genomicSeq_stop   = pos+self.intron_size*2+self.read_size*2
 
       currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+      dna            = currentDNASeq
 
-      don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
+      alt_don_pos,acc_pos = self.findHighestScoringSpliceSites(currentAcc,currentDon)
 
-      print don_pos,acc_pos
+      #print alt_don_pos,acc_pos
 
-      """
-      donor_elem = dna[exons[0,1]:exons[0,1]+2]
-      acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
-      """
+      alternativeScores = []
 
-      dna            = currentDNASeq
-      exons          = zeros((2,2),dtype=numpy.int)
-      exons[0,0]     = 0
-      exons[0,1]     = don_pos
-      exons[1,0]     = acc_pos+1
-      exons[1,1]     = acc_pos+1+(self.read_size-don_pos)
-      est            = unb_seq
-      original_est   = seq
-      quality        = prb
+      for don_pos in alt_don_pos:
+         exons          = zeros((2,2),dtype=numpy.int)
+         exons[0,0]     = 0
+         exons[0,1]     = don_pos
+         exons[1,0]     = acc_pos+1
+         exons[1,1]     = acc_pos+1+(self.read_size-don_pos)
+         est            = unb_seq
+         original_est   = seq
+         quality        = prb
+
+         _dna = dna[:int(exons[1,1])]
+         _dna = _dna[:exons[1,0]] + orig_seq[don_pos:]
+
+         #pdb.set_trace()
 
-      dna = dna[:int(exons[1,1])]
-      currentAcc = currentAcc[:int(exons[1,1])]
-      currentDon = currentDon[:int(exons[1,1])]
+         _currentAcc = currentAcc[:int(exons[1,1])]
+         _currentDon = currentDon[:int(exons[1,1])]
 
-      currentVMatchAlignment = dna, exons, est, original_est, quality,\
-      currentAcc, currentDon
+         currentVMatchAlignment = _dna, exons, est, original_est, quality,\
+         _currentAcc, _currentDon
 
-      alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
+         #alternativeScore = self.calcAlignmentScore(currentVMatchAlignment)
+         alternativeScores.append(self.calcAlignmentScore(currentVMatchAlignment))
 
-      return alternativeScore
+      return alternativeScores
 
 
    def calcAlignmentScore(self,alignment):
@@ -288,7 +305,8 @@ if __name__ == '__main__':
    jp = os.path.join
 
    run_fname      = jp(dir,'run_object.pickle')
-   data_fname     = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
+   #data_fname     = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_flag'
+   data_fname     = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_spliced_flag'
 
    param_fname = jp(dir,'param_500.pickle')