git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8640 e1793c9e...
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
index 3a2a2fc..78f7ebe 100644 (file)
@@ -1,8 +1,6 @@
 #!/usr/bin/env python 
 # -*- coding: utf-8 -*- 
 
-
-
 import cPickle
 import sys
 import pydb
@@ -15,31 +13,81 @@ def prediction_on(filename):
     
    allPredictions = cPickle.load(open(filename))
 
+   allPositions = {}
+
+
    for current_example_pred in allPredictions:
       gt_example = current_example_pred[0]
 
       exampleId = gt_example['id']
-      originalEst = OriginalEsts[exampleIdx]
 
       for elem_nr,current_pred in enumerate(current_example_pred[1:]):
-         current_pred
-         exampleId = curremt_pred['id']
+         exampleId   = current_pred['id']
+         chr         = current_pred['chr']
+         strand      = current_pred['strand']
+         true_cut    = current_pred['true_cut']
+         start_pos   = current_pred['alternative_start_pos']
+         alignment   = current_pred['alignment']
+
          predExons = current_pred['predExons']
          trueExons = current_pred['trueExons']
          predPositions = [elem + current_pred['alternative_start_pos'] for elem in predExons]
          truePositions = [elem + current_pred['start_pos'] for elem in trueExons.flatten().tolist()[0]]
 
-         if predExons.shape == (2,2):
-            p1 = predExons[0,0]
-            p2 = predExons[0,1]
-            p3 = predExons[1,0]
-            p4 = predExons[1,1]
+         if len(predExons) == 4:
+            p1 = predExons[0]
+            p2 = predExons[1]
+            p3 = predExons[2]
+            p4 = predExons[3]
+
+            #line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
+            #print line
+            allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
+
+
+   return allPositions
+
+
+def writePredictions(fname,allPositions):
+
+   out_fh = open(fname,'w+')
 
-            line = '%d\t%d\t%d\t%d\t%d\n' % (exampleId,p1,p2,p3,p4)
+   allEntries = {}
 
-         print line
+   for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
+      line = line.strip()
+      id,seq,q1,q2,q3 = line.split()
+      id = int(id)
+      
+      allEntries[id] = (seq,q1,q2,q3)
 
-def collect_prediction(current_dir,run_name):
+
+   for id,elems in allPositions.items():
+      id += 1000000000000
+      seq,q1,q2,q3 = allEntries[id]
+      chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
+
+      p1 += start_pos
+      p2 += start_pos
+      p3 += start_pos
+      p4 += start_pos
+
+      #pdb.set_trace()
+
+      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+      tExonSizes,tStarts, tEnds) = alignment 
+
+      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+      (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+      str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+      str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+      str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+
+      out_fh.write(new_line)
+
+   out_fh.close()
+
+def collect_prediction(current_dir):
    """
    Given the toplevel directory this function takes care that for each distinct
    experiment the training and test predictions are evaluated.
@@ -48,27 +96,32 @@ def collect_prediction(current_dir,run_name):
    train_suffix   = '_allPredictions_TRAIN'
    test_suffix    = '_allPredictions_TEST'
 
+   run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'
+
    jp = os.path.join
    b2s = ['-','+']
 
-   currentRun = cPickle.load(open(jp(current_dir,'run_object.pickle')))
+   currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
    QFlag    = currentRun['enable_quality_scores']
    SSFlag   = currentRun['enable_splice_signals']
    ILFlag   = currentRun['enable_intron_length']
    currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
    
-   filename =  jp(current_dir,run_name)+train_suffix
-   print 'Prediction on: %s' % filename
-   train_result = prediction_on(filename)
+   #filename =  jp(current_dir,run_name)+train_suffix
+   #print 'Prediction on: %s' % filename
+   #prediction_on(filename)
 
    filename =  jp(current_dir,run_name)+test_suffix
    print 'Prediction on: %s' % filename
    test_result = prediction_on(filename)
 
-   return train_result,test_result,currentRunId
+   fname = 'predictions.txt'
+   writePredictions(fname,test_result)
+
+   #return train_result,test_result,currentRunId
 
 
 if __name__ == '__main__':
    dir = sys.argv[1]
-   run_name = sys.argv[2]
-   collect_prediction(dir,run_name)
+   #run_name = sys.argv[2]
+   collect_prediction(dir)