+ fixed bug in reads stemming from negative strand
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Aug 2008 11:30:12 +0000 (11:30 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Aug 2008 11:30:12 +0000 (11:30 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10391 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/PipelineHeuristic.py

index e6e3a39..b19eb63 100644 (file)
@@ -15,18 +15,20 @@ from qpalma.computeSpliceAlignWithQuality import *
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
-from qpalma.parsers import *
+#from qpalma.parsers import *
 
-from ParaParser import *
+from ParaParser import ParaParser,IN_VECTOR
 
 from qpalma.Lookup import LookupTable
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
 
-from qpalma.sequence_utils import reverse_complement,unbracket_seq
+
+def cpu():
+   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
+   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
 
 
 class BracketWrapper:
-   #fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
-   #'offset', 'seq', 'prb', 'cal_prb', 'chastity']
    fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
 
    def __init__(self,filename):
@@ -68,8 +70,6 @@ class PipelineHeuristic:
       self.run = run
 
       start = cpu()
-      # old version
-      #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
       self.all_remapped_reads = BracketWrapper(data_fname)
       stop = cpu()
 
@@ -91,7 +91,12 @@ class PipelineHeuristic:
       g_fmt = 'chr%d.dna.flat'
       s_fmt = 'contig_%d%s'
 
-      self.chromo_range = range(1,6)
+      #self.chromo_range = range(1,6)
+      self.chromo_range = range(1,2)
+
+      num_chromo = 2
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
 
       start = cpu()
       self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
@@ -204,7 +209,6 @@ class PipelineHeuristic:
       """
       run = self.run 
 
-
       ctr = 0
       unspliced_ctr  = 0
       spliced_ctr    = 0
@@ -224,13 +228,8 @@ class PipelineHeuristic:
          chr      = location['chr']
          pos      = location['pos']
          strand   = location['strand']
-         #mismatch = location['mismatches']
-         #length   = location['length']
-         #off      = location['offset']
          seq      = location['seq']
          prb      = location['prb']
-         #cal_prb  = location['cal_prb']
-         #chastity = location['chastity']
 
          id = int(id)
 
@@ -249,6 +248,7 @@ class PipelineHeuristic:
          if strand == '-':
             #pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
             unb_seq = reverse_complement(unb_seq)
+            seq = reverse_complement(seq)
 
          effective_len = len(unb_seq)
 
@@ -257,8 +257,12 @@ class PipelineHeuristic:
          genomicSeq_stop   = pos+effective_len
 
          start = cpu()
-         #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+
+         assert currentDNASeq_ == currentDNASeq
+         assert currentAcc_ == currentAcc_
+         assert currentDon_ == currentDon_
 
          stop = cpu()
          self.get_time += stop-start
@@ -271,15 +275,16 @@ class PipelineHeuristic:
          original_est   = seq
          quality        = map(lambda x:ord(x)-64,prb)
 
-         #pdb.set_trace()
-
          currentVMatchAlignment = dna, exons, est, original_est, quality,\
          currentAcc, currentDon
 
-         try:
-            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-         except:
-            alternativeAlignmentScores = []
+         #pdb.set_trace()
+
+         alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         #try:
+         #   alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         #except:
+         #   alternativeAlignmentScores = []
 
 
          if alternativeAlignmentScores == []:
@@ -384,6 +389,7 @@ class PipelineHeuristic:
 
       return proximal_acc,proximal_don,distal_acc,distal_don
 
+
    def calcAlternativeAlignments(self,location):
       """
       Given an alignment proposed by Vmatch this function calculates possible
@@ -400,7 +406,6 @@ class PipelineHeuristic:
       original_est = location['seq']
       quality      = location['prb']
       quality        = map(lambda x:ord(x)-64,quality)
-      #cal_prb  = location['cal_prb']
       
       original_est = original_est.lower()
       est = unbracket_seq(original_est)
@@ -412,6 +417,10 @@ class PipelineHeuristic:
       strand_map = {'D':'+', 'P':'-'}
       strand = strand_map[strand]
 
+      if strand == '-':
+         est = reverse_complement(est)
+         original_est = reverse_complement(original_est)
+
       start = cpu()
       #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
       currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
@@ -616,43 +625,3 @@ class PipelineHeuristic:
       self.calcAlignmentScoreTime += stop-start
       
       return score 
-
-
-def cpu():
-   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
-   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
-
-
-if __name__ == '__main__':
-   if len(sys.argv) != 6:
-      print 'Usage: ./%s data param run spliced.results unspliced.results' % (sys.argv[0])
-      sys.exit(1)
-
-   data_fname = sys.argv[1]
-   param_fname = sys.argv[2]
-   run_fname = sys.argv[3]
-
-   result_spliced_fname    = sys.argv[4]
-   result_unspliced_fname  = sys.argv[5]
-
-   jp = os.path.join
-
-   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
-
-   start = cpu()
-   ph1.filter()
-   stop = cpu()
-   print 'total time elapsed: %f' % (stop-start)
-
-   print 'time spend for get seq: %f' % ph1.get_time
-   print 'time spend for calcAlignmentScoreTime: %f' %  ph1.calcAlignmentScoreTime
-   print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
-   print 'time spend for count time: %f' % ph1.count_time
-   print 'time spend for init time: %f' % ph1.init_time
-   print 'time spend for main_loop time: %f' % ph1.main_loop
-   print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
-
-   print 'time spend for computeSpliceAlignWithQualityTime time: %f'% ph1.computeSpliceAlignWithQualityTime
-   print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
-   print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
-   print 'time spend forarray_stuff time: %f'% ph1.array_stuff