+ added script to parallelize QPalma heuristic on the cluster
author    fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 3 Jun 2008 10:32:05 +0000 (10:32 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 3 Jun 2008 10:32:05 +0000 (10:32 +0000)
+ added datafile splitting and rejoining method to Utils

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9360 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Utils.py
scripts/grid_heuristic.py [new file with mode: 0644]
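Taken together, the two changes set up a simple parallelization workflow: split the vmatch mapping file into roughly equal parts, then submit one PipelineHeuristic job per part to the cluster. A minimal driver sketch under that reading (paths and part count are the ones hard-coded in the scripts below; this is an illustration, not a committed entry point):

   from Utils import split_file_join_results
   import grid_heuristic

   # 1) split the mapping file into 10 parts: map.vm.part_0 .. map.vm.part_9
   split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm', 10)

   # 2) submit one heuristic filtering job per part to the cluster
   grid_heuristic.create_and_submit()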

index 5117d85..df46f4b 100644 (file)
@@ -140,3 +140,59 @@ def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
 
 def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
    return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
+
+
+##########
+
+
+def split_file_join_results(filename,parts):
+   """Split 'filename' into 'parts' files named '<filename>.part_<n>',
+   distributing the lines as evenly as possible (the last part takes
+   the remainder)."""
+
+   print 'counting lines'
+   line_ctr = 0
+   for line in open(filename,'r'):
+      line_ctr += 1
+
+   # build the (begin,end) line intervals for the parts
+   part = line_ctr / parts
+   all_intervals = []
+   end = 0
+   for idx in range(1,parts+1):
+      begin = end
+      if idx == parts:
+         end = line_ctr
+      else:
+         end = begin+part
+      all_intervals.append((begin,end))
+
+   # one output filename per interval
+   parts_fn = []
+   for pos,params in enumerate(all_intervals):
+      out_fn = '%s.part_%d'%(filename,pos)
+      print params,out_fn
+      parts_fn.append(out_fn)
+
+   # reverse so intervals/filenames can be pop()ed in order while streaming
+   parts_fn.reverse()
+   all_intervals.reverse()
+
+   out_fh  = None
+   beg,end = -1,-1
+   lineCtr = 0
+   for line in open(filename,'r'):
+      if not (beg <= lineCtr < end):
+         # current interval exhausted: close it and switch to the next part
+         if out_fh is not None:
+            out_fh.close()
+         beg,end = all_intervals.pop()
+         out_fh  = open(parts_fn.pop(),'w+')
+      out_fh.write(line)
+      lineCtr += 1
+
+   if out_fh is not None:
+      out_fh.close()
+
+
+
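To illustrate the interval bookkeeping above: with a 103-line file and parts = 10, part = line_ctr / parts evaluates to 10 (integer division under Python 2), so the intervals become (0,10), (10,20), ..., (90,103) and the last part absorbs the remainder. A small stand-alone check sketch (the 103-line count is made up for illustration):

   line_ctr, parts = 103, 10
   part = line_ctr / parts          # integer division in Python 2 -> 10
   all_intervals = []
   end = 0
   for idx in range(1, parts+1):
      begin = end
      if idx == parts:
         end = line_ctr
      else:
         end = begin + part
      all_intervals.append((begin, end))
   print all_intervals              # [(0, 10), (10, 20), ..., (90, 103)]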
diff --git a/scripts/grid_heuristic.py b/scripts/grid_heuristic.py
new file mode 100644 (file)
index 0000000..9c211c7
--- /dev/null
@@ -0,0 +1,59 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import cPickle
+import sys
+import pdb
+import os
+import os.path
+import math
+
+from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+
+from PipelineHeuristic import *
+
+import grid_heuristic
+
+from Utils import split_file_join_results
+
+def g_heuristic(run_fname,data_fname,param_fname,result_fname):
+   # run the PipelineHeuristic filter on a single datafile part
+   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname)
+   ph1.filter()
+   
+
+   return 'finished filtering set %s.' % data_fname
+
+def create_and_submit():
+   jp = os.path.join
+
+   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+   data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+
+   run_fname      = jp(run_dir,'run_obj.pickle')
+   data_fname     = jp(data_dir,'map.vm')
+   param_fname    = jp(run_dir,'param_526.pickle')
+
+   functionJobs=[]
+
+   for idx in range(10):
+      data_fname     = jp(data_dir,'map.vm.part_%d'%idx)
+      result_fname   = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
+
+      current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
+      current_job.h_vmem = '25.0G'
+      #current_job.express = 'True'
+
+      print "job #1: ", current_job.nativeSpecification
+
+      functionJobs.append(current_job)
+
+   processedFunctionJobs = processJobs(functionJobs)
+   print "ret fields AFTER execution on cluster"
+   for (i, job) in enumerate(processedFunctionJobs):
+      print "Job with id: ", i, "- ret: ", job.ret
+
+
+if __name__ == '__main__':
+   #split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
+   create_and_submit()
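The commit message also mentions rejoining, but the rejoin step itself is not part of this diff; presumably the per-part result files map.vm.part_<n>.heuristic are merged afterwards. A purely hypothetical sketch of that merge, assuming the results are plain text files that can simply be concatenated in part order (the helper name join_heuristic_results and the output filename are inventions for illustration):

   import os.path

   def join_heuristic_results(data_dir, parts, out_fname):
      # hypothetical helper: concatenate the per-part heuristic results in order
      out_fh = open(out_fname, 'w')
      for idx in range(parts):
         part_fn = os.path.join(data_dir, 'map.vm.part_%d.heuristic' % idx)
         for line in open(part_fn, 'r'):
            out_fh.write(line)
      out_fh.close()

   if __name__ == '__main__':
      data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
      join_heuristic_results(data_dir, 10, os.path.join(data_dir, 'map.vm.heuristic'))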