+ fixed missing newlines in output
+ switched to Lookup-based prefetching of sequences and splice-site scores and to the new parser interface
+ adjusted the heuristic thresholds (splice_thresh, splice_stop_thresh)
+ wrote the heuristic parameters as comment headers into both result files
+ moved the input/output file names to command-line arguments
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 17:41:38 +0000 (17:41 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 17:41:38 +0000 (17:41 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8691 e1793c9e-67f9-0310-80fc-b846ff1f7b36

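The fix itself is small: the original read lines returned by the new parser no longer carry a trailing newline, so the result writers now append the terminator explicitly instead of relying on the line already ending with one. A minimal sketch of that write pattern (the sample line is a made-up stand-in for a parsed Vmatch record; the file names are the old defaults that are now passed on the command line):

    result_spliced_fh   = open('splicedReads.heuristic', 'w+')
    result_unspliced_fh = open('unsplicedReads.heuristic', 'w+')

    # dummy stand-in for one parsed, unterminated Vmatch line
    original_line = 'read_1\t1\t1234\tD\t0\t36'

    # append the newline explicitly; without it consecutive reads
    # end up concatenated on a single output line
    result_unspliced_fh.write(original_line + '\n')
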
scripts/PipelineHeuristic.py

index 3686991..8e8b6d5 100644
@@ -21,12 +21,14 @@ from qpalma.Plif import Plf
 #from qpalma.tools.splicesites import getDonAccScores
 from qpalma.Configuration import *
 
-from compile_dataset import getSpliceScores, get_seq_and_scores
+#from compile_dataset import getSpliceScores, get_seq_and_scores
 
 from numpy.matlib import mat,zeros,ones,inf
 from numpy import inf,mean
 
-from qpalma.parsers import PipelineReadParser
+from qpalma.parsers import *
+
+from Lookup import *
 
 
 def unbracket_est(est):
@@ -62,6 +64,20 @@ class PipelineHeuristic:
       run = cPickle.load(open(run_fname))
       self.run = run
 
+      dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+      start = cpu()
+      self.all_remapped_reads = parse(data_fname)
+      stop = cpu()
+
+      print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
+
+
+      start = cpu()
+      self.lt1 = Lookup(dna_flat_files)
+      stop = cpu()
+      print 'prefetched sequence and splice data in %f sec' % (stop-start)
+
       self.result_spliced_fh = open(result_spliced_fname,'w+')
       self.result_unspliced_fh = open(result_unspliced_fname,'w+')
 
@@ -83,11 +99,30 @@ class PipelineHeuristic:
       self.read_size    = 36
 
       # parameters of the heuristics to decide whether the read is spliced
-      self.splice_thresh      = 0.005
+      #self.splice_thresh      = 0.005
+      self.splice_thresh      = 0.01
       self.max_intron_size    = 2000
       self.max_mismatch       = 2 
-      self.splice_stop_thresh = 0.99
+      #self.splice_stop_thresh = 0.99
+      self.splice_stop_thresh = 0.8
       self.spliced_bias       = 0.0
+
+      param_lines = [\
+      ('%f','splice_thresh',self.splice_thresh),\
+      ('%d','max_intron_size',self.max_intron_size),\
+      ('%d','max_mismatch',self.max_mismatch),\
+      ('%f','splice_stop_thresh',self.splice_stop_thresh),\
+      ('%f','spliced_bias',self.spliced_bias)]
+
+      param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
+      param_lines.append('# data: %s\n'%self.data_fname)
+      param_lines.append('# param: %s\n'%param_fname)
+
+      #pdb.set_trace()
+
+      for p_line in param_lines:
+         self.result_spliced_fh.write(p_line)
+         self.result_unspliced_fh.write(p_line)
    
       self.original_reads = {}
 
@@ -131,7 +166,6 @@ class PipelineHeuristic:
       self.alternativeScoresTime = 0.0
 
       self.count_time = 0.0
-      self.read_parsing = 0.0
       self.main_loop = 0.0
       self.splice_site_time = 0.0
       self.computeSpliceAlignWithQualityTime = 0.0
@@ -148,14 +182,6 @@ class PipelineHeuristic:
       """
       run = self.run 
 
-      start = cpu()
-
-      rrp = PipelineReadParser(self.data_fname)
-      all_remapped_reads = rrp.parse()
-
-      stop = cpu()
-
-      self.read_parsing = stop-start
 
       ctr = 0
       unspliced_ctr  = 0
@@ -164,114 +190,113 @@ class PipelineHeuristic:
       print 'Starting filtering...'
       _start = cpu()
 
-      for readId,currentReadLocations in all_remapped_reads.items():
-         for location in currentReadLocations[:1]:
+      #for readId,currentReadLocations in all_remapped_reads.items():
+         #for location in currentReadLocations[:1]:
 
-            id       = location['id']
-            chr      = location['chr']
-            pos      = location['pos']
-            strand   = location['strand']
-            mismatch = location['mismatches']
-            length   = location['length']
-            off      = location['offset']
-            seq      = location['seq']
-            prb      = location['prb']
-            cal_prb  = location['cal_prb']
-            chastity = location['chastity']
+      for location,original_line in self.all_remapped_reads:
+         
+         if ctr % 1000 == 0:
+            print ctr
+
+
+         id       = location['id']
+         chr      = location['chr']
+         pos      = location['pos']
+         strand   = location['strand']
+         mismatch = location['mismatches']
+         length   = location['length']
+         off      = location['offset']
+         seq      = location['seq']
+         prb      = location['prb']
+         cal_prb  = location['cal_prb']
+         chastity = location['chastity']
 
-            id = int(id)
+         id = int(id)
 
-            strand_map = {'+':'D', '-':'P'}
+         seq = seq.lower()
 
+         strand_map = {'D':'+', 'P':'-'}
 
-            original_line = '%d\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n' %\
-            (id,chr,pos,strand_map[strand],int(mismatch),int(length),int(off),seq.upper(),prb,cal_prb,chastity)
+         strand = strand_map[strand]
 
+         if strand == '-':
+            continue
 
-            if strand == '-':
-               continue
+         unb_seq = unbracket_est(seq)
+         effective_len = len(unb_seq)
 
-            if ctr == 1000:
-               break
+         genomicSeq_start  = pos
+         genomicSeq_stop   = pos+effective_len-1
 
-            #if pos > 10000000:
-            #   continue
-      
-            unb_seq = unbracket_est(seq)
-            effective_len = len(unb_seq)
+         start = cpu()
+         #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         stop = cpu()
+         self.get_time += stop-start
 
-            genomicSeq_start  = pos
-            genomicSeq_stop   = pos+effective_len-1
+         dna            = currentDNASeq
+         exons          = zeros((2,1))
+         exons[0,0]     = 0
+         exons[1,0]     = effective_len
+         est            = unb_seq
+         original_est   = seq
+         quality        = prb
 
-            start = cpu()
-            #print genomicSeq_start,genomicSeq_stop
-            currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-            stop = cpu()
-            self.get_time += stop-start
+         #pdb.set_trace()
 
-            dna            = currentDNASeq
-            exons          = zeros((2,1))
-            exons[0,0]     = 0
-            exons[1,0]     = effective_len
-            est            = unb_seq
-            original_est   = seq
-            quality        = prb
+         currentVMatchAlignment = dna, exons, est, original_est, quality,\
+         currentAcc, currentDon
 
-            #pdb.set_trace()
+         alternativeAlignmentScores = self.calcAlternativeAlignments(location)
 
-            currentVMatchAlignment = dna, exons, est, original_est, quality,\
-            currentAcc, currentDon
+         if alternativeAlignmentScores == []:
+             # no alignment necessary
+             maxAlternativeAlignmentScore = -inf
+             vMatchScore = 0.0 
+         else:
+             maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+             # compute alignment for vmatch unspliced read
+             vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
 
-            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         start = cpu()
+         
+         #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
+         #print 'all candidates %s' % str(alternativeAlignmentScores)
 
-            if alternativeAlignmentScores == []:
-                # no alignment necessary
-                maxAlternativeAlignmentScore = -inf
-                vMatchScore = 0.0 
-            else:
-                maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
-                # compute alignment for vmatch unspliced read
-                vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+         new_id = id - 1000000300000
 
-            start = cpu()
-            
-            #print 'vMatchScore/alternativeScore: %f %f ' % (vMatchScore,maxAlternativeAlignmentScore)
-            #print 'all candidates %s' % str(alternativeAlignmentScores)
+         unspliced = False
+         # unspliced 
+         if new_id > 0: 
+            unspliced = True
 
-            new_id = id - 1000000300000
+         # Seems that according to our learned parameters VMatch found a good
+         # alignment of the current read
+         if maxAlternativeAlignmentScore < vMatchScore:
+            unspliced_ctr += 1
 
-            unspliced = False
-            # unspliced 
-            if new_id > 0: 
-               unspliced = True
+            self.result_unspliced_fh.write(original_line+'\n') 
 
-            # Seems that according to our learned parameters VMatch found a good
-            # alignment of the current read
-            if maxAlternativeAlignmentScore < vMatchScore:
-               unspliced_ctr += 1
+            if unspliced:
+               self.true_neg += 1
+            else:
+               self.false_neg += 1
 
-               self.result_unspliced_fh.write(original_line) 
+         # We found an alternative alignment considering splice sites that scores
+         # higher than the VMatch alignment
+         else:
+            spliced_ctr += 1
 
-               if unspliced:
-                  self.true_neg += 1
-               else:
-                  self.false_neg += 1
+            self.result_spliced_fh.write(original_line+'\n')
 
-            # We found an alternative alignment considering splice sites that scores
-            # higher than the VMatch alignment
+            if unspliced:
+               self.false_pos += 1
             else:
-               spliced_ctr += 1
-
-               self.result_spliced_fh.write(original_line)
-
-               if unspliced:
-                  self.false_pos += 1
-               else:
-                  self.true_pos += 1
+               self.true_pos += 1
 
-            ctr += 1
-            stop = cpu()
-            self.count_time = stop-start
+         ctr += 1
+         stop = cpu()
+         self.count_time = stop-start
 
       _stop = cpu()
       self.main_loop = _stop-_start
@@ -343,14 +368,19 @@ class PipelineHeuristic:
       quality      = location['prb']
       cal_prb  = location['cal_prb']
       
+      original_est = original_est.lower()
       est = unbracket_est(original_est)
       effective_len = len(est)
 
       genomicSeq_start  = pos - self.max_intron_size
       genomicSeq_stop   = pos + self.max_intron_size + len(est)
 
+      strand_map = {'D':'+', 'P':'-'}
+      strand = strand_map[strand]
+
       start = cpu()
-      currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
       stop = cpu()
       self.get_time += stop-start
       dna   = currentDNASeq
@@ -560,37 +590,38 @@ def cpu():
 
 
 if __name__ == '__main__':
-   #run_fname = sys.argv[1]
-   #data_fname = sys.argv[2]
-   #param_filename = sys.argv[3]
+   if len(sys.argv) != 6:
+      print 'Usage: ./%s data param run spliced.results unspliced.results' % (sys.argv[0])
 
-   dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
-   jp = os.path.join
+   data_fname = sys.argv[1]
+   param_fname = sys.argv[2]
+   run_fname = sys.argv[3]
 
-   run_fname   = jp(dir,'run_object.pickle')
-   data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_1k'
+   result_spliced_fname    = sys.argv[4]
+   result_unspliced_fname  = sys.argv[5]
 
+   jp = os.path.join
+
+   #dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
+   #param_fname = jp(dir,'param_500.pickle')
+   #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_20k'
    #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
    #data_fname  = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
-
-   param_fname = jp(dir,'param_500.pickle')
-
-   result_spliced_fname    = 'splicedReads.heuristic'
-   result_unspliced_fname  = 'unsplicedReads.heuristic'
+   #result_spliced_fname    = 'splicedReads.heuristic'
+   #result_unspliced_fname  = 'unsplicedReads.heuristic'
 
    ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
 
    start = cpu()
    ph1.filter()
    stop = cpu()
-
    print 'total time elapsed: %f' % (stop-start)
+
    print 'time spend for get seq: %f' % ph1.get_time
    print 'time spend for calcAlignmentScoreTime: %f' %  ph1.calcAlignmentScoreTime
    print 'time spend for alternativeScoresTime: %f' % ph1.alternativeScoresTime
    print 'time spend for count time: %f' % ph1.count_time
    print 'time spend for init time: %f' % ph1.init_time
-   print 'time spend for read_parsing time: %f' % ph1.read_parsing
    print 'time spend for main_loop time: %f' % ph1.main_loop
    print 'time spend for splice_site_time time: %f' % ph1.splice_site_time
 
@@ -598,9 +629,3 @@ if __name__ == '__main__':
    print 'time spend for computeSpliceWeightsTime time: %f'% ph1.computeSpliceWeightsTime
    print 'time spend for DotProdTime time: %f'% ph1.DotProdTime
    print 'time spend forarray_stuff time: %f'% ph1.array_stuff
-   #import cProfile
-   #cProfile.run('ph1.filter()')
-
-   #import hotshot
-   #p = hotshot.Profile('profile.log')
-   #p.runcall(ph1.filter)
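
For context, the routing logic these hunks rework boils down to comparing the best splice-site-aware candidate score against the score of the plain Vmatch alignment and writing the original read line, now explicitly newline-terminated, to the matching result file. A condensed sketch of that rule (route_read and its arguments are illustrative stand-ins for the PipelineHeuristic internals, not the script's actual API):

    import sys

    def route_read(original_line, candidate_scores, score_unspliced,
                   spliced_fh, unspliced_fh):
        if candidate_scores == []:
            # no spliced candidate was generated, so the unspliced
            # alignment is not scored at all
            best_spliced = float('-inf')
            vmatch_score = 0.0
        else:
            best_spliced = max(candidate_scores)
            vmatch_score = score_unspliced()   # score of the unspliced Vmatch hit

        if best_spliced < vmatch_score:
            # the unspliced alignment wins: keep the read in the unspliced set
            unspliced_fh.write(original_line + '\n')
            return False
        # a splice-site-aware candidate scores at least as high: treat as spliced
        spliced_fh.write(original_line + '\n')
        return True

    # toy call: two spliced candidates beat an unspliced score of 9.0
    route_read('read_1\t1\t1234\t+', [12.5, 7.0], lambda: 9.0,
               sys.stdout, sys.stdout)

The corner case mirrors the script: an empty candidate list pits -inf against 0.0, so reads without any splice-site candidate always end up in the unspliced result file.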