+ minor changes to support transcriptome data
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
index 78f7ebe..5da7bb2 100644 (file)
 
 import cPickle
 import sys
-import pydb
 import pdb
 import os
 import os.path
 import math
+from qpalma.parsers import *
 
-def prediction_on(filename):
-    
-   allPredictions = cPickle.load(open(filename))
-
-   allPositions = {}
-
-
-   for current_example_pred in allPredictions:
-      gt_example = current_example_pred[0]
-
-      exampleId = gt_example['id']
+from Evaluation import load_chunks
 
-      for elem_nr,current_pred in enumerate(current_example_pred[1:]):
-         exampleId   = current_pred['id']
-         chr         = current_pred['chr']
-         strand      = current_pred['strand']
-         true_cut    = current_pred['true_cut']
-         start_pos   = current_pred['alternative_start_pos']
-         alignment   = current_pred['alignment']
-
-         predExons = current_pred['predExons']
-         trueExons = current_pred['trueExons']
-         predPositions = [elem + current_pred['alternative_start_pos'] for elem in predExons]
-         truePositions = [elem + current_pred['start_pos'] for elem in trueExons.flatten().tolist()[0]]
-
-         if len(predExons) == 4:
-            p1 = predExons[0]
-            p2 = predExons[1]
-            p3 = predExons[2]
-            p4 = predExons[3]
-
-            #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
-            #print line
-            allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
-
-
-   return allPositions
def prediction_on(current_dir,filtered_reads,out_fname):
   """
   Load the pickled prediction chunks found under current_dir and write
   one tab-separated alignment line per prediction to out_fname.

   current_dir    -- directory containing the prediction chunks
                     (read via load_chunks)
   filtered_reads -- filename of the filtered reads; handed to
                     qparser.parse_reads before the predictions are visited
   out_fname      -- name of the output file (opened/closed here)

   Returns allPositions (currently always empty; kept for interface
   compatibility with callers of the old version).
   """
   import qparser
   qparser.parse_reads(filtered_reads)

   out_fh = open(out_fname,'w+')

   allPredictions = load_chunks(current_dir)
   allPositions = {}

   for current_prediction in allPredictions:
      # renamed from `id` to avoid shadowing the builtin
      example_id = current_prediction['id']

      seq         = current_prediction['est']

      # CHECK !!! placeholder quality string; the real values should come
      # from the ground truth (qparser.fetch_read) once that is re-enabled
      q1          = 'zzz'

      chr         = current_prediction['chr']
      strand      = current_prediction['strand']
      start_pos   = current_prediction['start_pos']
      alignment   = current_prediction['alignment']

      # predicted exon boundaries shifted to absolute genomic coordinates
      # NOTE(review): currently not written to the output file
      predExons = [e+start_pos for e in current_prediction['predExons']]

      # Removed the reads of p_start/exon_stop/exon_start/p_stop here:
      # they referenced `current_ground_truth`, whose assignment is
      # commented out above, so they raised a NameError — and none of the
      # four values was ever used afterwards.

      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
      tExonSizes,tStarts, tEnds) = alignment 

      # str(list)[1:-1] turns "[1, 2, 3]" into "1, 2, 3"; the replace
      # strips the blanks so each field is a plain comma-separated list
      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
      (example_id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
      str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
      str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
      str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))

      out_fh.write(new_line)

   # the old version closed the handle; restore that so the output is flushed
   out_fh.close()

   return allPositions
 
if __name__ == '__main__':
   # usage: createAlignmentFileFromPrediction.py <prediction_dir> <filtered_reads> <out_fname>
   if len(sys.argv) != 4:
      sys.stderr.write('usage: %s <prediction_dir> <filtered_reads> <out_fname>\n' % sys.argv[0])
      sys.exit(1)

   current_dir    = sys.argv[1]
   filtered_reads = sys.argv[2]
   # renamed from `out_fh`: this is a filename, not a file handle —
   # prediction_on opens the file itself
   out_fname      = sys.argv[3]
   prediction_on(current_dir,filtered_reads,out_fname)