+ minor changes to support transcriptome data
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
index add55ec..5da7bb2 100644 (file)
@@ -9,52 +9,50 @@ import os.path
 import math
 from qpalma.parsers import *
 
-def prediction_on(filename,filtered_reads,out_fname):
+from Evaluation import load_chunks
+
+def prediction_on(current_dir,filtered_reads,out_fname):
     
-   print 'parsing filtered reads..'
-   all_filtered_reads = parse_filtered_reads(filtered_reads)
-   print 'found %d filtered reads' % len(all_filtered_reads)
+   import qparser
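+   # parse the filtered reads once so that individual entries can be fetched by id below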
+   qparser.parse_reads(filtered_reads)
 
    out_fh = open(out_fname,'w+')
 
-   allPredictions = cPickle.load(open(filename))
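+   # predictions are now loaded chunk-wise from the run directory instead of a single pickle file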
+   allPredictions = load_chunks(current_dir)
    allPositions = {}
 
    for current_prediction in allPredictions:
       id = current_prediction['id']
-      current_ground_truth = all_filtered_reads[id]
 
-      true_cut = current_ground_truth['true_cut']
-      seq = current_ground_truth['seq']
-      q1 = current_ground_truth['prb']
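+      # the ground-truth entry is still needed below for the read/exon boundary values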
+      current_ground_truth = qparser.fetch_read(id)
+      #true_cut = current_ground_truth['true_cut']
+      #seq = current_ground_truth['seq']
+      #q1 = current_ground_truth['prb']
+      seq         = current_prediction['est']
+
+      # CHECK !!! real quality values are not filled in yet; use a placeholder
+      q1          = 'zzz'
 
       chr         = current_prediction['chr']
       strand      = current_prediction['strand']
-      #true_cut    = current_prediction['true_cut']
       start_pos   = current_prediction['start_pos']
       alignment   = current_prediction['alignment']
 
       predExons = current_prediction['predExons']
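+      # shift the predicted exon boundaries by the genomic start position to get absolute coordinates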
+      predExons = [e+start_pos for e in predExons]
 
-      if len(predExons) == 4:
-         p1 = predExons[0]
-         p2 = predExons[1]
-         p3 = predExons[2]
-         p4 = predExons[3]
-
-      elif len(predExons) == 2:
-         p1 = predExons[0]
-         p2 = -1
-         p3 = -1
-         p4 = predExons[1]
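+      # read and exon boundary positions taken from the ground-truth entry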
+      p_start  = current_ground_truth['p_start']
+      e_stop   = current_ground_truth['exon_stop']
+      e_start  = current_ground_truth['exon_start']
+      p_stop   = current_ground_truth['p_stop']
 
-      #allPositions[exampleId] = (chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment)
+      correct_prediction = False
 
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
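+      # one tab-separated line per prediction: id, chr, strand, read sequence, quality,
+      # genomic start, query/target alignment boundaries, number of exons and exon coordinates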
 
       new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      (id,chr,strand,seq,q1,start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+      (id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
       str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
       str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
       str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
@@ -63,63 +61,8 @@ def prediction_on(filename,filtered_reads,out_fname):
 
    return allPositions
 
-
-def writePredictions(fname,allPositions):
-
-
-   allEntries = {}
-
-   for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
-      line = line.strip()
-      id,seq,q1,q2,q3 = line.split()
-      id = int(id)
-      
-      allEntries[id] = (seq,q1,q2,q3)
-
-
-   for id,elems in allPositions.items():
-      seq,q1,q2,q3 = allEntries[id]
-      chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
-
-      p1 += start_pos
-      p2 += start_pos
-      p3 += start_pos
-      p4 += start_pos
-
-
-
-   out_fh.close()
-
-def collect_prediction(current_dir):
-   """
-   Given the toplevel directory this function takes care that for each distinct
-   experiment the training and test predictions are evaluated.
-
-   """
-   train_suffix   = '_allPredictions_TRAIN'
-   test_suffix    = '_allPredictions_TEST'
-
-   run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'
-
-   jp = os.path.join
-   b2s = ['-','+']
-
-   #currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
-   #QFlag    = currentRun['enable_quality_scores']
-   #SSFlag   = currentRun['enable_splice_signals']
-   #ILFlag   = currentRun['enable_intron_length']
-   #currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
-   
-   filename =  jp(current_dir,run_name)+test_suffix
-   print 'Prediction on: %s' % filename
-   test_result = prediction_on(filename)
-
-   fname = 'predictions.txt'
-   writePredictions(fname,test_result)
-
-
 if __name__ == '__main__':
-   filename = sys.argv[1]
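+   # usage: createAlignmentFileFromPrediction.py <prediction_dir> <filtered_reads_file> <output_file>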
+   current_dir = sys.argv[1]
    filtered_reads = sys.argv[2]
    out_fh = sys.argv[3]
-   prediction_on(filename,filtered_reads,out_fh)
+   prediction_on(current_dir,filtered_reads,out_fh)