#!/usr/bin/env python
# -*- coding: utf-8 -*-

import cPickle
import os
import os.path

from pythongrid import KybJob, submit_jobs

# QPalma, DataAccessWrapper and SeqSpliceInfo come in via this star import.
from qpalma_main import *

# Self-import so that pythongrid can pickle a module-level reference to
# grid_predict.g_predict when shipping jobs to the cluster nodes.
import grid_predict

def get_slices(dataset_size, num_nodes):
    """
    Split the half-open range [0, dataset_size) into num_nodes contiguous
    (begin, end) slices; the last slice absorbs the division remainder.
    """
    all_instances = []

    part = dataset_size / num_nodes
    begin = 0
    end = 0
    for idx in range(1, num_nodes+1):
        if idx == num_nodes:
            begin = end
            end = dataset_size
        else:
            begin = end
            end = begin + part

        all_instances.append((begin, end))

    return all_instances

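# A minimal usage sketch (hypothetical sizes). With Python 2 integer
# division the last slice absorbs the remainder, so every key is covered:
#
#   >>> get_slices(10, 3)
#   [(0, 3), (3, 6), (6, 10)]
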
def makeJobs(run, dataset_fn, chunks, param_fn):
    """
    Create one KybJob per chunk; each job calls grid_predict.g_predict
    with its slice of the prediction keys.
    """
    jobs = []

    # fetch the data needed
    g_dir = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
    don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'

    g_fmt = 'chr%d.dna.flat'
    s_fmt = 'contig_%d%s'

    # chromosomes 1..5 of the A. thaliana genome
    num_chromo = 6

    accessWrapper = DataAccessWrapper(g_dir, acc_dir, don_dir, g_fmt, s_fmt)
    seqInfo = SeqSpliceInfo(accessWrapper, range(1, num_chromo))

    for c_name, current_chunk in chunks:
        current_job = KybJob(grid_predict.g_predict, [run, dataset_fn, current_chunk, param_fn, seqInfo, c_name])
        current_job.h_vmem = '20.0G'
        #current_job.express = 'True'

        print "job %s: %s" % (c_name, current_job.nativeSpecification)

        jobs.append(current_job)

    return jobs

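# makeJobs expects chunks as (name, keys) pairs, as produced by
# create_and_submit below. A sketch with hypothetical keys:
#
#   keys   = prediction_keys[0:6]
#   chunks = [('chunk_0', keys[0:3]), ('chunk_1', keys[3:6])]
#   jobs   = makeJobs(run, dataset_fn, chunks, param_fn)
#
# Each resulting KybJob carries the h_vmem request set above into its
# nativeSpecification, which is what the scheduler sees.
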
def create_and_submit():
    """
    Load the saved run object and the trained parameters, split the
    prediction keys into chunks and submit one prediction job per chunk.
    """
    jp = os.path.join

    run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'

    run = cPickle.load(open(jp(run_dir, 'run_obj.pickle')))
    run['name'] = 'saved_run'

    param_fn = jp(run_dir, 'param_526.pickle')

    run['result_dir'] = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
    dataset_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.pickle'
    prediction_keys_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.keys.pickle'

    prediction_keys = cPickle.load(open(prediction_keys_fn))

    print 'Found %d keys for prediction.' % len(prediction_keys)

    num_splits = 50
    slices = get_slices(len(prediction_keys), num_splits)
    chunks = []
    for idx, current_slice in enumerate(slices):
        c_name = 'chunk_%d' % idx
        chunks.append((c_name, prediction_keys[current_slice[0]:current_slice[1]]))

    functionJobs = makeJobs(run, dataset_fn, chunks, param_fn)

    # sanity check: the chunks together have to cover every prediction key
    total = sum([len(elem) for name, elem in chunks])
    assert total == len(prediction_keys)

    print 'Got %d job(s)' % len(functionJobs)

    (sid, jobids) = submit_jobs(functionJobs)

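# Once submitted, the results could be fetched roughly as sketched below.
# This assumes pythongrid's collect_jobs(sid, jobids, jobs, wait=True)
# interface, which waits for completion and fills in each job's ret field
# (the string returned by g_predict):
#
#   from pythongrid import collect_jobs
#   finished = collect_jobs(sid, jobids, functionJobs, wait=True)
#   for job in finished:
#       print job.ret
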
def g_predict(run, dataset_fn, prediction_keys, param_fn, seqInfo, set_name):
    """
    Run the QPalma prediction for one chunk of keys; this is the function
    executed on a cluster node via pythongrid.
    """
    qp = QPalma(False)
    qp.init_prediction(run, dataset_fn, prediction_keys, param_fn, seqInfo, set_name)
    return 'finished prediction of set %s.' % set_name

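# For debugging a single chunk without the grid, g_predict can be called
# directly (a sketch; seqInfo is built exactly as in makeJobs above):
#
#   accessWrapper = DataAccessWrapper(g_dir, acc_dir, don_dir, g_fmt, s_fmt)
#   seqInfo = SeqSpliceInfo(accessWrapper, range(1, num_chromo))
#   print g_predict(run, dataset_fn, prediction_keys[0:10], param_fn,
#                   seqInfo, 'debug_chunk')
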
if __name__ == '__main__':
    create_and_submit()