+ changed code for prediction -> easier and faster
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 23:51:15 +0000 (23:51 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 23:51:15 +0000 (23:51 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8696 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/qpalma_main.py

index 7d278cb..23b66af 100644 (file)
@@ -633,13 +633,11 @@ class QPalma:
 ###############################################################################
 #
 # End of the code needed for training 
-# 
 #
 # Begin of code for prediction
 #
 ###############################################################################
 
-
    def evaluate(self,param_filename):
       run = self.run
       beg = run['prediction_begin']
@@ -647,16 +645,16 @@ class QPalma:
 
       data_filename = self.run['dataset_filename']
 
-      SeqInfo, Exons, OriginalEsts, Qualities,\
-      AlternativeSequences = paths_load_data(data_filename)
+      dataset = cPickle.load(open(data_filename))
+      SeqInfo, OriginalEsts, Qualities  = dataset
 
       #AlternativeSequences = paths_load_data(data_filename,'training',None,self.ARGS)
 
       self.SeqInfo     = SeqInfo
-      self.Exons       = Exons
+      #self.Exons       = Exons
       self.OriginalEsts= OriginalEsts
       self.Qualities   = Qualities
-      self.AlternativeSequences = AlternativeSequences
+      #self.AlternativeSequences = AlternativeSequences
 
       #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
       #print 'leaving constructor...'
@@ -691,16 +689,15 @@ class QPalma:
          assert(False)
 
       SeqInfo     = self.SeqInfo[beg:end]
-      Exons       = self.Exons[beg:end]
+      #Exons       = self.Exons[beg:end]
       OriginalEsts= self.OriginalEsts[beg:end]
       Qualities   = self.Qualities[beg:end]
-      AlternativeSequences = self.AlternativeSequences[beg:end]
+      #AlternativeSequences = self.AlternativeSequences[beg:end]
 
       # number of training instances
       N = numExamples = len(SeqInfo) 
-      assert len(Exons) == N and len(OriginalEsts) == N\
-      and len(Qualities) == N and len(AlternativeSequences) == N,\
-      'The Exons,Acc,Don,.. arrays are of different lengths'
+      assert len(OriginalEsts) == N\
+      and len(Qualities) == N, 'The Exons,Acc,Don,.. arrays are of different lengths'
 
       self.plog('Number of training examples: %d\n'% numExamples)
 
@@ -739,15 +736,17 @@ class QPalma:
          self.plog('Loading example nr. %d...\n'%exampleIdx)
 
          currentSeqInfo = SeqInfo[exampleIdx]
-         id,chr,strand,up_cut,down_cut,true_cut = currentSeqInfo 
+         #chr,strand,up_cut,down_cut = currentSeqInfo 
 
-         try:
-            dna,est,acc_supp,don_supp,exons,original_est =\
-            getData(SeqInfo,OriginalEsts,Exons,exampleIdx,run)
-         except SpliceSiteException:
-            continue
+         #id,chr,strand,genomicSeq_start,genomicSeq_stop =\
+         #currentSeqInfo 
 
-         dna_len = len(dna)
+         # just for debugging
+         id,chr,strand,genomicSeq_start,genomicSeq_stop =\
+         currentSeqInfo 
+
+         est = OriginalEsts[exampleIdx]
+         est = unbracket_est(est)
 
          if run['mode'] == 'normal':
             quality = [40]*len(est)
@@ -758,70 +757,44 @@ class QPalma:
          if not run['enable_quality_scores']:
             quality = [40]*len(est)
 
-         if not run['enable_splice_signals']:
-            for idx,elem in enumerate(don_supp):
-               if elem != -inf:
-                  don_supp[idx] = 0.0
-
-            for idx,elem in enumerate(acc_supp):
-               if elem != -inf:
-                  acc_supp[idx] = 0.0
-
-         currentAlternatives = AlternativeSequences[exampleIdx]
+         #currentAlternatives = AlternativeSequences[exampleIdx]
          current_example_predictions = []
 
-         # first make a prediction on the dna fragment which comes from the ground truth                  
-         #current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
-         #current_prediction['exampleIdx'] = exampleIdx
-         #current_prediction['id'] = id
-         #current_prediction['start_pos']  = up_cut
-         #current_prediction['label'] = True
-         #current_prediction['true_cut'] = true_cut
-         #current_prediction['chr'] = chr
-         #current_prediction['strand'] = strand
-
-         #current_example_predictions.append(current_prediction)
-
          # then make predictions for all dna fragments that where occurring in
          # the vmatch results
-         for alternative_alignment in currentAlternatives:
-            chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
+         #chr, strand, genomicSeq_start, genomicSeq_stop = alternative_alignment
 
-            if not chr in range(1,6):
-               continue
+         if not chr in range(1,6):
+            continue
 
-            currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+         currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
 
-            if not run['enable_splice_signals']:
-               for idx,elem in enumerate(currentDon):
-                  if elem != -inf:
-                     currentDon[idx] = 0.0
+         if not run['enable_splice_signals']:
+            for idx,elem in enumerate(currentDon):
+               if elem != -inf:
+                  currentDon[idx] = 0.0
 
-               for idx,elem in enumerate(currentAcc):
-                  if elem != -inf:
-                     currentAcc[idx] = 0.0
+            for idx,elem in enumerate(currentAcc):
+               if elem != -inf:
+                  currentAcc[idx] = 0.0
 
-            current_prediction = self.calc_alignment(currentDNASeq, est, exons,\
-            quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
-            current_prediction['exampleIdx'] = exampleIdx
-            current_prediction['id'] = id
-            current_prediction['start_pos'] = up_cut
-            current_prediction['alternative_start_pos'] = genomicSeq_start
-            current_prediction['label'] = currentLabel
-            current_prediction['true_cut'] = true_cut
-            current_prediction['chr'] = chr
-            current_prediction['strand'] = strand
+         current_prediction = self.calc_alignment(currentDNASeq, est,\
+         quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
 
-            current_example_predictions.append(current_prediction)
+         current_prediction['id'] = id
+         #current_prediction['start_pos'] = up_cut
+         current_prediction['start_pos'] = genomicSeq_start
+         current_prediction['chr'] = chr
+         current_prediction['strand'] = strand
 
-         allPredictions.append(current_example_predictions)
+         allPredictions.append(current_prediction)
 
       # end of the prediction loop we save all predictions in a pickle file and exit
       cPickle.dump(allPredictions,open('%s_allPredictions_%s'%(run['name'],set_flag),'w+'))
       print 'Prediction completed'
 
 
-   def calc_alignment(self, dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+   def calc_alignment(self, dna, est, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
       """
       Given two sequences and the parameters we calculate on alignment
       """
@@ -852,41 +825,15 @@ class QPalma:
       _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
       _newEstAlign = newEstAlign.flatten().tolist()[0]
        
-      #if True:
       alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
       #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
       #self.plog(line1+'\n')
       #self.plog(line2+'\n')
       #self.plog(line3+'\n')
 
-      #currentAlignmentString = ''
-
-      #for idx in range(len(line1)):
-      #   dna_elem    = line1[idx]
-      #   match_elem  = line2[idx]
-      #   est_elem    = line3[idx]
-
-      #   if dna_elem != '' and est_elem != '' and match_elem != '':
-      #      if dna_elem == est_elem:
-      #         currentAlignmentString = '%s%s' % (currentAlignmentString,est_elem)
-
-      #   if dna_elem != '' and est_elem != '' and match_elem == '':
-      #         currentAlignmentString = '%s[%s%s]' %\
-      #         (currentAlignmentString,dna_elem,est_elem)
-
-      #   if dna_elem != '' and est_elem == '_':
-      #         currentAlignmentString = '%s[%s-]' %\
-      #         (currentAlignmentString,dna_elem)
-      #   
-      #   if dna_elem == '_' and est_elem != '':
-      #         currentAlignmentString = '%s[-%s]' %\
-      #         (currentAlignmentString,est_elem)
-            
-
       newExons = self.calculatePredictedExons(newSpliceAlign)
 
-      current_prediction = {'predExons':newExons, 'trueExons':exons,\
-      'dna':dna, 'est':est, 'DPScores':newDPScores,\
+      current_prediction = {'predExons':newExons, 'dna':dna, 'est':est, 'DPScores':newDPScores,\
       'alignment':alignment}
 
       return current_prediction