from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
# how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
translation = '-acgtn_'
def alignment_reconstruct(current_prediction,num_exons):
   """
   We reconstruct the exact alignment to infer the positions of indels and the
   sizes of the respective exons.
   """
   SpliceAlign = current_prediction['spliceAlign']
   estAlign = current_prediction['estAlign']
   # NOTE(review): the reconstruction code that actually computes
   # exonIdentities and exonGaps is not visible in this hunk; as shown,
   # both names would be undefined at the return below -- confirm against
   # the full source.
   return exonIdentities,exonGaps
def getUniquePrediction(allPredictions):
   """
   Since one read can have several alignments due to several possible seed
   positions we have to preprocess all prediction. This function return for
   every read only one alignment that is the highest scoring under the QPalma
   model.
   """
   allUniquePredictions = {}
   for new_prediction in allPredictions:
      # NOTE(review): the guard that decides whether the stored alignment is
      # replaced (presumably a DPScores comparison against an existing entry
      # keyed by the read id) is not visible in this hunk; only the
      # else-branch survives here -- verify against the full source.
      else:
         allUniquePredictions[id] = new_prediction
   return allUniquePredictions
+def createOutput(current_loc,out_fname,output_format):
+ """
+ Given all the predictions and a result file name. This function calculates
+ all aligmnment information and returns the result in the specified format.
+ """
- # CHECK !!!
- q1 = 'zzz'
+ out_fh = open(out_fname,'w+')
+ allPredictions = cPickle.load(open(current_loc))
+ #allPredictions = current_loc
- chromo = current_prediction['chr']
- strand = current_prediction['strand']
- start_pos = current_prediction['start_pos']
- alignment = current_prediction['alignment']
+ print 'Loaded %d examples' % len(allPredictions)
+
+ # fetch the data needed
+ accessWrapper = DataAccessWrapper(settings)
+ seqInfo = SeqSpliceInfo(accessWrapper,,settings['allowed_fragments'])
- DPScores = current_prediction['DPScores']
- predExons = current_prediction['predExons']
- predExons = [e+start_pos for e in predExons]
+ # make predictions unique
+ allUniquePredictions = getUniquePrediction(allPredictions)
- (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
- tExonSizes,tStarts, tEnds) = alignment
+ written_lines = 0
+ for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
- if len(qExonSizes) != num_exons:
- print 'BUG exon sizes %d'%id
- continue
+ if output_format == 'BLAT':
+ # BLAT output
+ new_line = createBlatOutput(current_prediction)
- EST2GFF = False
- new_line = ''
-
- if EST2GFF:
- predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
-
- match = predExons[0,1] - predExons[0,0]
- if predExons.shape == (2,2):
- match += predExons[1,1] - predExons[1,0]
-
- mismatch = 0
- repmatch = 0
- Ns = 0
- Qgapcnt = 0
- Qgapbs = 0
-
- Tgapcnt = 0
- Tgapbs = 0
-
- if predExons.shape == (2,2):
- Tgapbs += predExons[1,0] - predExons[0,1]
- Tgapcnt = 1
-
- # &strand, Qname, &Qsize,
- # &Qstart, &Qend, Tname, &Tsize,
- # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
- #
- # ATTENTION
- #
- # we enforce equal exons sizes for q and t because we are missing indel
- # positions of the alignment
-
- if qExonSizes[0] != tExonSizes[0]:
- continue
-
- Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
- Qsize = len(seq)
- Qstart = qStart
- Qend = qEnd
- Tname = 'CHR%d'%chromo
-
- start_pos -= 2
-
- Tsize = tEnd+1 + start_pos
- Tstart = tStart + start_pos
- Tend = tEnd + start_pos
- blockcnt = Tgapcnt+1
- exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
- Qstarts_str = str(qStarts)[1:-1].replace(' ','')
- Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
-
- new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
- % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
- Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
- Tname, Tsize, Tstart, Tend,\
- blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+ elif output_format == 'ShoRe':
+ # ShoRe output
+ new_line = createShoReOutput(current_prediction)
+
+ elif output_format == 'mGene':
+ # Genefinding output for mGene
+ new_line = createGenefindingOutput(current_prediction)
else:
- #try:
- exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
- #except:
- # print 'Bug in example %d (idx: %d)' %\
- # (current_prediction['id'],pred_idx)
- # continue
-
- #t_size = tEnd - tStart
- #read_size = 38
- #if strand == '-':
- # total_size = seqInfo.chromo_sizes[chromo]
-
- # start_pos = total_size - start_pos
-
- # qExonSizes.reverse()
- # qStarts = [read_size-e for e in qEnds]
- # qStarts.reverse()
- # qEnds = [read_size-e for e in qStarts]
- # qEnds.reverse()
-
- # tExonSizes.reverse()
- # tStarts = [t_size-e for e in tEnds]
- # tStarts.reverse()
- # tEnds = [t_size-e for e in tStarts]
- # tEnds.reverse()
-
- # exonIdentities.reverse()
- # exonGaps.reverse()
-
- pp = lambda x : str(x)[1:-1].replace(' ','')
-
- new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
- pp(qExonSizes),pp(qStarts),\
- pp(qEnds),pp(tExonSizes),\
- pp(tStarts),pp(tEnds),\
- pp(exonIdentities),pp(exonGaps))
+ assert False
out_fh.write(new_line)
written_lines += 1
print 'Wrote out %d lines' % written_lines
+
+
+def createBlatOutput(current_prediction):
+ """
+ This function takes one prediction as input and returns the corresponding
+ alignment in BLAT format.
+ """
+
+ id = current_prediction['id']
+
+ seq = current_prediction['read']
+ dna = current_prediction['dna']
+
+ # we do not really need the quality values here
+ q1 = 'zzz'
+
+ chromo = current_prediction['chr']
+ strand = current_prediction['strand']
+ start_pos = current_prediction['start_pos']
+ alignment = current_prediction['alignment']
+
+ DPScores = current_prediction['DPScores']
+ predExons = current_prediction['predExons']
+ predExons = [e+start_pos for e in predExons]
+
+ (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+ tExonSizes,tStarts, tEnds) = alignment
+
+ if len(qExonSizes) != num_exons:
+ print 'BUG exon sizes %d'%id
+ continue
+
+ EST2GFF = False
+ new_line = ''
+
+ if EST2GFF:
+ predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+
+ match = predExons[0,1] - predExons[0,0]
+ if predExons.shape == (2,2):
+ match += predExons[1,1] - predExons[1,0]
+
+ mismatch = 0
+ repmatch = 0
+ Ns = 0
+ Qgapcnt = 0
+ Qgapbs = 0
+
+ Tgapcnt = 0
+ Tgapbs = 0
+
+ if predExons.shape == (2,2):
+ Tgapbs += predExons[1,0] - predExons[0,1]
+ Tgapcnt = 1
+
+ # &strand, Qname, &Qsize,
+ # &Qstart, &Qend, Tname, &Tsize,
+ # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
+ #
+ # ATTENTION
+ #
+ # we enforce equal exons sizes for q and t because we are missing indel
+ # positions of the alignment
+
+ if qExonSizes[0] != tExonSizes[0]:
+ continue
+
+ Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
+ Qsize = len(seq)
+ Qstart = qStart
+ Qend = qEnd
+ Tname = 'CHR%d'%chromo
+
+ start_pos -= 2
+
+ Tsize = tEnd+1 + start_pos
+ Tstart = tStart + start_pos
+ Tend = tEnd + start_pos
+ blockcnt = Tgapcnt+1
+ exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+ Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+ Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
+ new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+ % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+ Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+ Tname, Tsize, Tstart, Tend,\
+ blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+
+ else:
+ #try:
+ exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+ #except:
+ # print 'Bug in example %d (idx: %d)' %\
+ # (current_prediction['id'],pred_idx)
+ # continue
+
+ #t_size = tEnd - tStart
+ #read_size = 38
+ #if strand == '-':
+ # total_size = seqInfo.chromo_sizes[chromo]
+
+ # start_pos = total_size - start_pos
+
+ # qExonSizes.reverse()
+ # qStarts = [read_size-e for e in qEnds]
+ # qStarts.reverse()
+ # qEnds = [read_size-e for e in qStarts]
+ # qEnds.reverse()
+
+ # tExonSizes.reverse()
+ # tStarts = [t_size-e for e in tEnds]
+ # tStarts.reverse()
+ # tEnds = [t_size-e for e in tStarts]
+ # tEnds.reverse()
+
+ # exonIdentities.reverse()
+ # exonGaps.reverse()
+
+ pp = lambda x : str(x)[1:-1].replace(' ','')
+
+ new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+ (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+ pp(qExonSizes),pp(qStarts),\
+ pp(qEnds),pp(tExonSizes),\
+ pp(tStarts),pp(tEnds),\
+ pp(exonIdentities),pp(exonGaps))
+
+ return new_line
+
+
def createGenefindingOutput(current_prediction,out_fname=None):
   """
   Placeholder for the Genefinding/mGene output formatter.

   Accepts a single prediction (matching how createOutput() calls it; the
   optional out_fname keeps the old two-argument call style working) and
   returns an empty string so the caller can write the result safely.
   """
   # TODO(review): not implemented yet -- previously returned None, which
   # would crash a caller doing out_fh.write(...).
   return ''
+
def createShoReOutput(current_prediction,out_fname=None):
   """
   Placeholder for the ShoRe output formatter.

   Accepts a single prediction (matching how createOutput() calls it; the
   optional out_fname keeps the old two-argument call style working) and
   returns an empty string so the caller can write the result safely.
   """
   # TODO(review): not implemented yet -- previously returned None, which
   # would crash a caller doing out_fh.write(...).
   return ''
import gridtools
-from utils import get_slices,split_file
+from utils import get_slices,split_file,combine_files
+
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
+
+from qpalma_main import QPalma
jp = os.path.join
"""
After creation of jobs this function submits them to the cluster.
"""
- for current_job in self.functionJobs:
- self.sid, self.jobids = submit_jobs([current_job])
+ self.sid, self.jobids = submit_jobs(self.functionJobs)
+ #self.processedFunctionJobs = process_jobs(self.functionJobs)
def Restart(self,id):
completed successfully.
"""
+ #print 'checking whether jobs finished...'
+ #while not get_status(self.sid, self.jobids):
+ # time.sleep(7)
+ #print 'collecting jobs'
+ #retjobs = collect_jobs(self.sid, self.jobids, self.functionJobs)
+
print 'checking whether finished'
while not get_status(self.sid, self.jobids):
- time.sleep(7)
+ time.sleep(10)
+
print 'collecting jobs'
retjobs = collect_jobs(self.sid, self.jobids, self.functionJobs)
+ print "ret fields AFTER execution on cluster"
+ for (i, job) in enumerate(retjobs):
+ print "Job #", i, "- ret: ", job.ret
+
+ print '--------------'
#print "ret fields AFTER execution on cluster"
- #for (i, job) in enumerate(retjobs):
+ #for (i, job) in enumerate(self.processedFunctionJobs):
# print "Job #", i, "- ret: ", job.ret
- #print '--------------'
+
self.collectResults()
for idx in range(0,num_splits):
data_fname = jp(result_dir,'map.part_%d'%idx)
- result_fname = jp(result_dir,'map.vm.part_%d.heuristic'%idx)
+ result_fname = jp(result_dir,'map.vm.part_%d.heuristic.spliced'%idx)
self.result_files.append(result_fname)
current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.settings])
result_dir = self.settings['approximation_dir']
combined_fn = jp(result_dir,'map.vm.spliced')
combine_files(self.result_files,combined_fn)
- combine_files([combined_fn,settings['spliced_reads_fn']],'map.vm')
+ combine_files([combined_fn,self.settings['spliced_reads_fn']],'map.vm')
def ApproximationTaskStarter(run_fname,data_fname,param_fname,result_fname,settings):
run = cPickle.load(open(self.settings['run_fn']))
run['name'] = 'saved_run'
- param = self.settings['prediction_parameter_fn']
+ param_fn = self.settings['prediction_parameter_fn']
run['result_dir'] = self.settings['prediction_dir']
dataset_fn = self.settings['prediction_dataset_fn']
chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
for c_name,current_chunk in chunks:
- current_job = KybJob(gridtools.TaskStarter,[run,dataset_fn,current_chunk,param,c_name])
+ current_job = KybJob(gridtools.AlignmentTaskStarter,[self.settings,run,dataset_fn,current_chunk,param_fn,c_name])
current_job.h_vmem = '20.0G'
#current_job.express = 'True'
print 'Got %d job(s)' % len(self.functionJobs)
def AlignmentTaskStarter(settings,run,dataset_fn,prediction_keys,param_fn,set_name):
   """
   Cluster-node entry point for one alignment chunk: builds the sequence
   access objects from ``settings``, instantiates QPalma with them and
   starts the prediction for the given keys.

   Returns a short status string used by the job-collection code.
   """
   accessWrapper = DataAccessWrapper(settings)
   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
   qp = QPalma(run,seqInfo)
   qp.init_prediction(dataset_fn,prediction_keys,param_fn,set_name)
   return 'finished prediction of set %s.' % set_name
After QPalma predicted alignments this task postprocesses the data.
"""
   def __init__(self,settings):
      # forward the settings dict to the generic ClusterTask base class
      ClusterTask.__init__(self,settings)
def CreateJobs(self):
self.result_files.append(result_fn)
- current_job = KybJob(grid_alignment.TaskStarter,[chunk_fn,result_fn])
+ current_job = KybJob(gridtools.PostProcessingTaskStarter,[chunk_fn,result_fn])
current_job.h_vmem = '15.0G'
current_job.express = 'True'
print "job #1: ", current_job.nativeSpecification
+ self.functionJobs.append(current_job)
+
   def collectResults(self):
      # concatenate all per-chunk result files into a single alignment file
      combined_fn = jp(self.result_dir,'all_alignments.align')
      combine_files(self.result_files,combined_fn)
def PostProcessingTaskStarter(chunk_fn,result_fn):
   # Cluster-node entry point: convert one chunk of predictions into an
   # alignment file.
   create_alignment_file(chunk_fn,result_fn)
ClusterTask.__init__(self)
   def DummyTaskStarter(param):
      # NOTE(review): chunk_fn and result_fn are not derived from ``param``
      # here -- they must come from an enclosing scope; verify this is
      # intentional.
      create_alignment_file(chunk_fn,result_fn)
result_fn = jp(result_dir,'%s.align_remap'%chunk_name)
chunk_fn = jp(run_dir,chunk_fn)
- current_job = KybJob(grid_alignment.g_alignment,[chunk_fn,result_fn])
+ current_job = KybJob(grid_alignment.DummyTaskStarter,[chunk_fn,result_fn])
current_job.h_vmem = '15.0G'
current_job.express = 'True'