+ minor changes in the paths
[qpalma.git] / scripts / PipelineHeuristic.py
index b19eb63..f4050bc 100644 (file)
@@ -15,8 +15,6 @@ from qpalma.computeSpliceAlignWithQuality import *
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
-#from qpalma.parsers import *
-
 from ParaParser import ParaParser,IN_VECTOR
 
 from qpalma.Lookup import LookupTable
@@ -91,8 +89,8 @@ class PipelineHeuristic:
       g_fmt = 'chr%d.dna.flat'
       s_fmt = 'contig_%d%s'
 
-      #self.chromo_range = range(1,6)
-      self.chromo_range = range(1,2)
+      self.chromo_range = range(1,6)
+      #self.chromo_range = range(1,2)
 
       num_chromo = 2
       accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
@@ -123,6 +121,8 @@ class PipelineHeuristic:
       self.qualityPlifs = qualityPlifs
 
       self.read_size    = 38
+      self.prb_offset   = 64
+      #self.prb_offset   = 50
 
       # parameters of the heuristics to decide whether the read is spliced
       #self.splice_thresh      = 0.005
@@ -257,12 +257,13 @@ class PipelineHeuristic:
          genomicSeq_stop   = pos+effective_len
 
          start = cpu()
-         currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
-         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
 
-         assert currentDNASeq_ == currentDNASeq
-         assert currentAcc_ == currentAcc_
-         assert currentDon_ == currentDon_
+         # just performed some tests to check LookupTable results
+         #assert currentDNASeq_ == currentDNASeq
+         #assert currentAcc_ == currentAcc_
+         #assert currentDon_ == currentDon_
 
          stop = cpu()
          self.get_time += stop-start
@@ -273,19 +274,19 @@ class PipelineHeuristic:
          exons[1,0]     = effective_len
          est            = unb_seq
          original_est   = seq
-         quality        = map(lambda x:ord(x)-64,prb)
+         quality        = map(lambda x:ord(x)-self.prb_offset,prb)
 
          currentVMatchAlignment = dna, exons, est, original_est, quality,\
          currentAcc, currentDon
 
          #pdb.set_trace()
 
-         alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-         #try:
-         #   alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-         #except:
-         #   alternativeAlignmentScores = []
+         #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
 
+         try:
+            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         except:
+            alternativeAlignmentScores = []
 
          if alternativeAlignmentScores == []:
              # no alignment necessary
@@ -295,10 +296,11 @@ class PipelineHeuristic:
              maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
              # compute alignment for vmatch unspliced read
              vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+             #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+             
 
          start = cpu()
          
-         #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
          #print 'all candidates %s' % str(alternativeAlignmentScores)
 
          new_id = id - 1000000300000
@@ -370,7 +372,6 @@ class PipelineHeuristic:
       #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
       #distal_acc=distal_acc[-2:]
 
-
       proximal_don   = []
       for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
          if currentDon[idx] >= splice_thresh:
@@ -405,7 +406,7 @@ class PipelineHeuristic:
       strand   = location['strand']
       original_est = location['seq']
       quality      = location['prb']
-      quality        = map(lambda x:ord(x)-64,quality)
+      quality        = map(lambda x:ord(x)-self.prb_offset,quality)
       
       original_est = original_est.lower()
       est = unbracket_seq(original_est)
@@ -423,14 +424,16 @@ class PipelineHeuristic:
 
       start = cpu()
       #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
-      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
       stop = cpu()
       self.get_time += stop-start
       dna   = currentDNASeq
-   
+
       proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
        
       alternativeScores = []
+
+      #pdb.set_trace()
       
       # inlined
       h = self.h
@@ -519,6 +522,8 @@ class PipelineHeuristic:
               if acc_score>=self.splice_stop_thresh:
                   break
               
+      #pdb.set_trace()
+
       _stop = cpu()
       self.alternativeScoresTime += _stop-_start