+ added selection option for alignment format
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 12 Jun 2008 12:36:11 +0000 (12:36 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 12 Jun 2008 12:36:11 +0000 (12:36 +0000)
+ added pythongrid script for alignment output

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9579 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/createAlignmentFileFromPrediction.py
scripts/grid_alignment.py [new file with mode: 0644]

index 9e97190..54d3beb 100644 (file)
@@ -9,20 +9,15 @@ import os.path
 import math
 import numpy
 
-#from qpalma.parsers import *
-
-from Evaluation import load_chunks
 import qparser
 
-
-def create_aglignment_file(allPredictions,out_fname):
+def create_alignment_file(current_loc,out_fname):
    import numpy
    #qparser.parse_reads(filtered_reads)
 
    out_fh = open(out_fname,'w+')
 
-   #allPredictionsmllPredictions = load_chunks(current_loc)
-   #allPredictions = cPickle.load(open(current_loc))
+   allPredictions = cPickle.load(open(current_loc))
 
    allPositions = {}
 
@@ -45,83 +40,80 @@ def create_aglignment_file(allPredictions,out_fname):
       start_pos   = current_prediction['start_pos']
       alignment   = current_prediction['alignment']
 
+      DPScores   = current_prediction['DPScores']
+
       predExons = current_prediction['predExons']
       predExons = [e+start_pos for e in predExons]
 
-      #p_start  = current_ground_truth['p_start']
-      #e_stop   = current_ground_truth['exon_stop']
-      #e_start  = current_ground_truth['exon_start']
-      #p_stop   = current_ground_truth['p_stop']
-
-      # &match, &mismatch, &repmatch, &Ns, &Qgapcnt, &Qgapbs, &Tgapcnt, &Tgapbs,                                                                                                                                              
-      predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
-      #predExons = predExons.T
-
-      #print predExons
-      #pdb.set_trace()
-
-      match = predExons[0,1] - predExons[0,0] 
-      if predExons.shape == (2,2):
-         match += predExons[1,1] - predExons[1,0] 
-
-      mismatch = 0
-      repmatch = 0
-      Ns = 0
-      Qgapcnt = 0
-      Qgapbs = 0
-
-      Tgapcnt = 0
-      Tgapbs = 0
-
-      if predExons.shape == (2,2):
-         Tgapbs += predExons[1,0] - predExons[0,1]
-         Tgapcnt = 1
-
-      # &strand, Qname, &Qsize, 
-      
-      # &Qstart, &Qend, Tname, &Tsize,
-      # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
 
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
 
-      #
-      # ATTENTION
-      #  
-      # we enforce equal exons sizes for q and t because we are missing indel
-      # positions of the alignment
-
-      if qExonSizes[0] != tExonSizes[0]:
-         continue
-
-      Qname = str(id)
-      Qsize = len(seq)
-      Qstart = qStart
-      Qend = qEnd
-      Tname = 'CHR%d'%chromo
-
-      start_pos -= 2
-
-      Tsize = tEnd+1 + start_pos
-      Tstart =  tStart + start_pos
-      Tend = tEnd + start_pos
-      blockcnt = Tgapcnt+1
-      exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
-      Qstarts_str = str(qStarts)[1:-1].replace(' ','')
-      Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
-
-      #new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      #(id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-      #str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
-      #str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
-      #str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
-
-      new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
-      % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
-      Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
-      Tname, Tsize, Tstart, Tend,\
-      blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
-
+      
+      EST2GFF = False
+      if EST2GFF:
+         predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+
+         match = predExons[0,1] - predExons[0,0] 
+         if predExons.shape == (2,2):
+            match += predExons[1,1] - predExons[1,0] 
+
+         mismatch = 0
+         repmatch = 0
+         Ns = 0
+         Qgapcnt = 0
+         Qgapbs = 0
+
+         Tgapcnt = 0
+         Tgapbs = 0
+
+         if predExons.shape == (2,2):
+            Tgapbs += predExons[1,0] - predExons[0,1]
+            Tgapcnt = 1
+
+         # &strand, Qname, &Qsize, 
+         # &Qstart, &Qend, Tname, &Tsize,
+         # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
+
+         #
+         # ATTENTION
+         #  
+         # we enforce equal exons sizes for q and t because we are missing indel
+         # positions of the alignment
+
+         if qExonSizes[0] != tExonSizes[0]:
+            continue
+
+         Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
+         Qsize = len(seq)
+         Qstart = qStart
+         Qend = qEnd
+         Tname = 'CHR%d'%chromo
+
+         start_pos -= 2
+
+         Tsize = tEnd+1 + start_pos
+         Tstart =  tStart + start_pos
+         Tend = tEnd + start_pos
+         blockcnt = Tgapcnt+1
+         exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+         Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+         Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
+      try:
+         new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+         (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+         str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+         str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+         str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+      except:
+         pdb.set_trace()
+
+      #new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+      #% (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+      #Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+      #Tname, Tsize, Tstart, Tend,\
+      #blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
 
       out_fh.write(new_line)
 
@@ -130,4 +122,4 @@ def create_aglignment_file(allPredictions,out_fname):
 if __name__ == '__main__':
    current_dir = sys.argv[1]
    out_fh = sys.argv[2]
-   create_aglignment_file(current_dir,out_fh)
+   create_alignment_file(current_dir,out_fh)
diff --git a/scripts/grid_alignment.py b/scripts/grid_alignment.py
new file mode 100644 (file)
index 0000000..c2b8cff
--- /dev/null
@@ -0,0 +1,60 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import cPickle
+import sys
+import pdb
+import os
+import os.path
+import math
+
+from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+
+from createAlignmentFileFromPrediction import *
+
+import grid_alignment
+
+from Utils import split_file_join_results
+
+
+def g_alignment(chunk_fn,result_fn):
+   create_alignment_file(chunk_fn,result_fn)
+
+
+def create_and_submit():
+   jp = os.path.join
+
+   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+
+   chunks_fn = []
+   for fn in os.listdir(run_dir):
+      if fn.startswith('chunk'):
+         chunks_fn.append(fn)
+
+   functionJobs=[]
+
+   for chunk_fn in chunks_fn:
+      chunk_name  = chunk_fn[:chunk_fn.find('.')]
+      result_fn   = jp(data_dir,'%s.align_remap'%chunk_name)
+      chunk_fn = jp(run_dir,chunk_fn) 
+
+      #pdb.set_trace()
+
+      current_job = KybJob(grid_alignment.g_alignment,[chunk_fn,result_fn])
+      current_job.h_vmem = '15.0G'
+      #current_job.express = 'True'
+
+      print "job #1: ", current_job.nativeSpecification
+
+      functionJobs.append(current_job)
+
+   processedFunctionJobs = processJobs(functionJobs)
+   print "ret fields AFTER execution on cluster"
+   for (i, job) in enumerate(processedFunctionJobs):
+      print "Job with id: ", i, "- ret: ", job.ret
+
+
+if __name__ == '__main__':
+   #split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
+   create_and_submit()