+ changed alignment output format to be more blat-like
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 17 Apr 2008 09:54:19 +0000 (09:54 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 17 Apr 2008 09:54:19 +0000 (09:54 +0000)
+ performed prediction to see whether changes are correct

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8561 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/Utils.py
scripts/createAlignmentFileFromPrediction.py
scripts/qpalma_main.py

index bc30d59..7fa0363 100644 (file)
@@ -296,7 +296,8 @@ def perform_prediction(current_dir,run_name):
    This function takes care of starting the jobs needed for the prediction phase
    of qpalma
    """
-   for i in range(1,6):
+   #for i in range(1,6):
+   for i in range(1,2):
       cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s %d |\
       qsub -l h_vmem=12.0G -cwd -j y -N \"%s_%d.log\"'%(current_dir,i,run_name,i)
 
@@ -327,12 +328,12 @@ def forall_experiments(current_func,tl_dir):
 
       current_func(current_dir,run_name)
 
-      train_result,test_result,currentRunId = current_func(current_dir,run_name)
-      all_results[currentRunId] = test_result
-      pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts = test_result
-      all_error_rates[currentRunId] = (incorrect_gt_cuts,incorrect_vmatch_cuts)
+      #train_result,test_result,currentRunId = current_func(current_dir,run_name)
+      #all_results[currentRunId] = test_result
+      #pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts = test_result
+      #all_error_rates[currentRunId] = (incorrect_gt_cuts,incorrect_vmatch_cuts)
 
-   createErrorVSCutPlot(all_error_rates)
+   #createErrorVSCutPlot(all_error_rates)
    #createTable(all_results)
 
 
@@ -344,5 +345,5 @@ if __name__ == '__main__':
    #data_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_new'
    #data = cPickle.load(open(data_fn))
 
-   #forall_experiments(perform_prediction,dir)
-   forall_experiments(collect_prediction,dir)
+   forall_experiments(perform_prediction,dir)
+   #forall_experiments(collect_prediction,dir)
index f87103f..c17fee9 100644 (file)
@@ -131,3 +131,7 @@ def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
    line3 = "".join(est_array)
 
    return line1,line2,line3
+
+
+def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
+   return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
index e089626..a8fdb18 100644 (file)
@@ -27,6 +27,7 @@ def prediction_on(filename):
          strand      = current_pred['strand']
          true_cut    = current_pred['true_cut']
          start_pos   = current_pred['alternative_start_pos']
+         alignment   = current_pred['alignment']
 
          predExons = current_pred['predExons']
          trueExons = current_pred['trueExons']
@@ -41,7 +42,7 @@ def prediction_on(filename):
 
             #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
             #print line
-            allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4)
+            allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
 
 
    return allPositions
@@ -62,16 +63,23 @@ def writePredictions(fname,allPositions):
 
 
    for id,elems in allPositions.items():
-      seq,q1,q2,q3 = allEntries[1000000000000+id]
-      chr,strand,start_pos,true_cut,p1,p2,p3,p4 = elems
+      id += 1000000000000
+      seq,q1,q2,q3 = allEntries[id]
+      chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
 
       p1 += start_pos
       p2 += start_pos
       p3 += start_pos
       p4 += start_pos
 
-      new_line = '%d\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n' %\
-      (id,chr,strand,seq,q1,q2,q3,true_cut,p1,p2,p3,p4)
+      #pdb.set_trace()
+
+      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+      tExonSizes,tStarts, tEnds) = alignment 
+
+      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+      (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,str(qExonSizes)[1:-1],str(qStarts)[1:-1],str(qEnds)[1:-1],\
+      str(tExonSizes)[1:-1],str(tStarts)[1:-1],str(tEnds)[1:-1])
 
       out_fh.write(new_line)
 
index b7fbf7f..0e160a6 100644 (file)
@@ -48,7 +48,7 @@ from qpalma.Configuration import *
 # functions
 from Genefinding import *
 from genome_utils import load_genomic
-from Utils import calc_stat, calc_info, pprint_alignment
+from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
 
 class SpliceSiteException:
    pass
@@ -572,10 +572,10 @@ class QPalma:
                   _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
                   _newEstAlign = newEstAlign[0].flatten().tolist()[0]
 
-                  line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
-                  self.plog(line1+'\n')
-                  self.plog(line2+'\n')
-                  self.plog(line3+'\n')
+                  #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+                  #self.plog(line1+'\n')
+                  #self.plog(line2+'\n')
+                  #self.plog(line3+'\n')
 
                # if there is at least one useful false alignment add the
                # corresponding constraints to the optimization problem
@@ -873,16 +873,42 @@ class QPalma:
       _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
       _newEstAlign = newEstAlign.flatten().tolist()[0]
        
-      if False:
-         line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
-         self.plog(line1+'\n')
-         self.plog(line2+'\n')
-         self.plog(line3+'\n')
+      #if True:
+      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
+      #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+      #self.plog(line1+'\n')
+      #self.plog(line2+'\n')
+      #self.plog(line3+'\n')
+
+      #currentAlignmentString = ''
+
+      #for idx in range(len(line1)):
+      #   dna_elem    = line1[idx]
+      #   match_elem  = line2[idx]
+      #   est_elem    = line3[idx]
+
+      #   if dna_elem != '' and est_elem != '' and match_elem != '':
+      #      if dna_elem == est_elem:
+      #         currentAlignmentString = '%s%s' % (currentAlignmentString,est_elem)
+
+      #   if dna_elem != '' and est_elem != '' and match_elem == '':
+      #         currentAlignmentString = '%s[%s%s]' %\
+      #         (currentAlignmentString,dna_elem,est_elem)
+
+      #   if dna_elem != '' and est_elem == '_':
+      #         currentAlignmentString = '%s[%s-]' %\
+      #         (currentAlignmentString,dna_elem)
+      #   
+      #   if dna_elem == '_' and est_elem != '':
+      #         currentAlignmentString = '%s[-%s]' %\
+      #         (currentAlignmentString,est_elem)
+            
 
       newExons = self.calculatePredictedExons(newSpliceAlign)
 
       current_prediction = {'predExons':newExons, 'trueExons':exons,\
-      'dna':dna, 'est':est, 'DPScores':newDPScores}
+      'dna':dna, 'est':est, 'DPScores':newDPScores,\
+      'alignment':alignment}
 
       return current_prediction