#
###############################################################################
-import qpalma.Configuration as Conf
+import QPalmaConfiguration as Conf
from Run import *
import pdb
import os
import math
import resource
-from qpalma.DataProc import *
from qpalma.computeSpliceWeights import *
from qpalma.set_param_palma import *
from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-from qpalma.Configuration import *
from numpy.matlib import mat,zeros,ones,inf
from numpy import inf,mean
from ParaParser import *
-from Lookup import *
+from Lookup import Lookup
-from qpalma.sequence_utils import *
+from qpalma.sequence_utils import reverse_complement,unbracket_seq
strand = strand_map[strand]
- #if strand == '-':
- # continue
if not chr in range(1,6):
continue
- unb_seq = unbracket_est(seq)
+ unb_seq = unbracket_seq(seq)
+
+ # reads mapped to the reverse strand must be reverse-complemented first
+ if strand == '-':
+ unb_seq = reverse_complement(unb_seq)
+
effective_len = len(unb_seq)
genomicSeq_start = pos
start = cpu()
#currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+
stop = cpu()
self.get_time += stop-start
cal_prb = location['cal_prb']
original_est = original_est.lower()
- est = unbracket_est(original_est)
+ est = unbracket_seq(original_est)
effective_len = len(est)
genomicSeq_start = pos - self.max_intron_size
# -*- coding: utf-8 -*-
import pdb
+import os.path
from numpy.matlib import mat,zeros,ones,inf
##########
-def split_file_join_results(filename,parts):
+def split_file(filename,result_dir,parts):
+ jp = os.path.join
+
all_intervals = []
print 'counting lines'
end = begin+part
params = (begin,end)
- print begin,end
-
all_intervals.append(params)
- pdb.set_trace()
-
- infile = open(filename,'r')
-
parts_fn = []
for pos,params in enumerate(all_intervals):
beg,end = params
- print params
- out_fn = '%s.part_%d'%(filename,pos)
- parts_fn.append(out_fn)
- #out_fh = open(out_fn,'w+')
+ out_fn = jp(result_dir,'map.part_%d'%pos)
print out_fn
+ parts_fn.append(out_fn)
+ out_fh = open(out_fn,'w+')
parts_fn.reverse()
all_intervals.reverse()
params = all_intervals.pop()
beg,end = params
out_fn = parts_fn.pop()
+ out_fh.close()
out_fh = open(out_fn,'w+')
out_fh.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+#
+#
+#
+#
+
import os.path
import cPickle
-import qpalma.Configuration as Conf
+import QPalmaConfiguration as Conf
def check_vmatch_params(Config):
# The purpose of this script is to check the created dataset pickle files for
# consistency before doing any kind of training/prediction on the data.
#
-#
import sys
import pdb
import cPickle
-from compile_dataset import get_seq_and_scores
+from qpalma.sequence_utils import get_seq_and_scores,unbracket_seq,reverse_complement
def checkAll(filename):
+ """
+ Load the dataset and perform some sanity checks and assertions to make
+ sure that the set is in the right shape for QPalma to train on or to
+ predict on.
+ """
+
dataset = cPickle.load(open(filename))
- for idx,(current_key,current_value) in enumerate(dataset.iteritems()):
- (currentSeqInfo,original_est,currentQualities) = current_value
- currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
- currentQualities = (prb,cal_prb,chastity)
+ # we take the first quality vector of the tuple of quality vectors
+ quality_index = 0
+
+ status = True
+ mes = '---'
- if idx > 0 and idx % 250 == 0:
- print 'processed %d examples' % idx
+ idx = 0
+ for example_key in dataset.keys():
+ matches = dataset[example_key]
+ print 'Current example %d has %d matches' % (example_key,len(matches))
- currentInfo = SeqInfo[idx]
- chr,strand,p1,p2 = currentInfo
- currentExon = Exons[idx]
- currentExon[0,1] += 1
- currentExon -= 1
+ for example in matches:
+ (currentSeqInfo,original_est,currentQualities) = example
- currentEst = OriginalEsts[idx]
- originalEst = OriginalEsts[idx]
+ (id,chromo,strand,genomicSeq_start,genomicSeq_stop) = currentSeqInfo
- if currentEst.find('[') != -1:
- #print currentEst
- currentEst = unbracket_est(currentEst)
- #print currentEst
+ assert chromo in range(1,6), pdb.set_trace()
+ assert strand in ['+','-'], pdb.set_trace()
- assert len(currentEst) == 36, pdb.set_trace()
+ quality = currentQualities[quality_index]
- first_seq = get_seq( currentExon[0,0], currentExon[0,1], True )
- end = first_seq[-2:]
- first_seq = first_seq[:-2]
- second_seq = get_seq( currentExon[1,0], currentExon[1,1]+1, False )
- begin = second_seq[:2]
- second_seq = second_seq[2:]
+ # check for key consistency
+ assert id == example_key
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- seq, acc, don =\
- get_seq_and_scores(chr,strand,currentExon[0,0],currentExon[1,1]+1,dna_flat_files)
+ if idx > 0 and idx % 1000 == 0:
+ print 'processed %d examples' % idx
- assert (len(first_seq) + len(second_seq)) == 36, pdb.set_trace()
+ dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- if not (end == 'gt' or end == 'gc'):
- print 'invalid donor in example %d'% idx
- print SeqInfo[idx]
- print currentExon
+ genomic_seq_pos,acc_pos,don_pos = get_seq_and_scores(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
+ genomic_seq_neg,acc_neg,don_neg = get_seq_and_scores(chromo,'-',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
- #invalid_donor_ctr += 1
- #continue
+ assert reverse_complement(genomic_seq_neg) == genomic_seq_pos
- if not (begin == 'ag'):
- print 'invalid acceptor in example %d'% idx
- print SeqInfo[idx]
- print currentExon
- pdb.set_trace()
+ return status,mes
if __name__ == '__main__':
We reconstruct the exact alignment to infer the positions of indels and the
sizes of the respective exons.
"""
+ translation = '-acgtn_' # how the aligned read and dna sequences are displayed (gap_char, 5 nucleotides, intron_char)
SpliceAlign = current_prediction['spliceAlign']
estAlign = current_prediction['estAlign']
dna_array = current_prediction['dna_array']
- est_array = current_prediction['est_array']
+ read_array = current_prediction['read_array']
- dna_array = "".join(map(lambda x: str(x),dna_array))
- est_array = "".join(map(lambda x: str(x),est_array))
+ dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
+ read_array = "".join(map(lambda x: translation[int(x)],read_array))
- translation = '-ACGTN_' #how aligned est and dna sequence is displayed
- #(gap_char, 5 nucleotides, intron_char)
-
- # this array hold a number for each exons indicating its number of matches
+ # this array holds a number for each exon indicating its number of matches
exonIdentities = [0]*num_exons
- exonGaps = [0]*num_exons
-
- est_array = map(lambda x: translation[int(x)],est_array)
- dna_array = map(lambda x: translation[int(x)],dna_array)
+ exonGaps = [0]*num_exons
- gaps = 0
+ gaps = 0
identity = 0
- idx = 0
+ idx = 0
- #if num_exons > 1:
- # pdb.set_trace()
-
- for ridx in range(len(est_array)):
- read_elem = est_array[ridx]
+ for ridx in range(len(read_array)):
+ read_elem = read_array[ridx]
dna_elem = dna_array[ridx]
- if ridx > 0 and est_array[ridx-1] != '_' and est_array[ridx] == '_':
+ if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
idx += 1
gaps = 0
identity = 0
def create_alignment_file(current_loc,out_fname):
- import numpy
- #qparser.parse_reads(filtered_reads)
+ print 'Working on %s'%current_loc
out_fh = open(out_fname,'w+')
-
allPredictions = cPickle.load(open(current_loc))
#allPredictions = current_loc
- allPositions = {}
+ print 'Loaded %d examples' % len(allPredictions)
- for current_prediction in allPredictions:
+ written_lines = 0
+ for pred_idx,current_prediction in enumerate(allPredictions):
id = current_prediction['id']
#current_ground_truth = qparser.fetch_read(id)
#seq = current_ground_truth['seq']
#q1 = current_ground_truth['prb']
- seq = current_prediction['est']
+ seq = current_prediction['read']
dna = current_prediction['dna']
# CHECK !!!
(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
tExonSizes,tStarts, tEnds) = alignment
+ if len(qExonSizes) != num_exons:
+ print 'BUG exon sizes %d'%id
+ continue
+
EST2GFF = False
new_line = ''
else:
- num_exons = len(qExonSizes)
- exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+ try:
+ exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+ except:
+ print 'Bug in example %d (idx: %d)' %\
+ (current_prediction['id'],pred_idx)
+ continue
pp = lambda x : str(x)[1:-1].replace(' ','')
new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
(id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
- str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
- str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
- str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''),\
- str(exonIdentities)[1:-1].replace(' ',''),str(exonGaps)[1:-1].replace(' ',''))
+ pp(qExonSizes),pp(qStarts),\
+ pp(qEnds),pp(tExonSizes),\
+ pp(tStarts),pp(tEnds),\
+ pp(exonIdentities),pp(exonGaps))
+ written_lines += 1
out_fh.write(new_line)
- return allPositions
-
+ print 'Wrote out %d lines' % written_lines
if __name__ == '__main__':
current_dir = sys.argv[1]
import cPickle
import sys
+import time
import pdb
import os
import os.path
import math
-from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+from pythongrid import KybJob, Usage
+from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
from createAlignmentFileFromPrediction import *
import grid_alignment
-from Utils import split_file_join_results
-
-
def g_alignment(chunk_fn,result_fn):
create_alignment_file(chunk_fn,result_fn)
jp = os.path.join
#run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
- run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-
- data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+ #run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
+ #run_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
+ #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+ #data_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
+
+ run_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+ data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+
+ #run_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+ #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
chunks_fn = []
for fn in os.listdir(run_dir):
if fn.startswith('chunk'):
chunks_fn.append(fn)
-
- chunks_fn = [\
- 'chunk_17.predictions.pickle',\
- 'chunk_18.predictions.pickle',\
- 'chunk_19.predictions.pickle',\
- 'chunk_20.predictions.pickle',\
- 'chunk_21.predictions.pickle',\
- 'chunk_23.predictions.pickle',\
- 'chunk_24.predictions.pickle'
- ]
+ #chunks_fn = [\
+ #'chunk_0.predictions.pickle',\
+ #'chunk_4.predictions.pickle']
print chunks_fn
-
functionJobs=[]
for chunk_fn in chunks_fn:
current_job = KybJob(grid_alignment.g_alignment,[chunk_fn,result_fn])
current_job.h_vmem = '15.0G'
- #current_job.express = 'True'
+ current_job.express = 'True'
print "job #1: ", current_job.nativeSpecification
- functionJobs.append(current_job)
-
- processedFunctionJobs = processJobs(functionJobs)
- print "ret fields AFTER execution on cluster"
- for (i, job) in enumerate(processedFunctionJobs):
- print "Job with id: ", i, "- ret: ", job.ret
+ functionJobs = [current_job]
+ #functionJobs.append(current_job)
+ (sid, jobids) = submit_jobs(functionJobs)
+ time.sleep(10)
+ #break
if __name__ == '__main__':
- #split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
create_and_submit()
import cPickle
import sys
import pdb
+import time
import os
import os.path
import math
-from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+from pythongrid import KybJob, Usage
+from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
from PipelineHeuristic import *
import grid_heuristic
-from Utils import split_file_join_results
+from Utils import split_file
+
def g_heuristic(run_fname,data_fname,param_fname,result_fname):
#print run_fname,data_fname,param_fname,result_fname
ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname)
ph1.filter()
-
return 'finished filtering set %s.' % data_fname
+
def create_and_submit():
jp = os.path.join
run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
- data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+ data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_0'
run_fname = jp(run_dir,'run_obj.pickle')
- data_fname = jp(data_dir,'map.vm')
+ original_map_fname = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/main/map.vm'
+ #split_file(original_map_fname,data_dir,25)
+
param_fname = jp(run_dir,'param_526.pickle')
functionJobs=[]
- for idx in range(10):
- data_fname = jp(data_dir,'map.vm.part_%d'%idx)
+ for idx in range(3,25):
+ data_fname = jp(data_dir,'map.part_%d'%idx)
result_fname = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
+ #pdb.set_trace()
+
current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
current_job.h_vmem = '25.0G'
- #current_job.express = 'True'
+ current_job.express = 'True'
print "job #1: ", current_job.nativeSpecification
functionJobs.append(current_job)
- processedFunctionJobs = processJobs(functionJobs)
- print "ret fields AFTER execution on cluster"
- for (i, job) in enumerate(processedFunctionJobs):
- print "Job with id: ", i, "- ret: ", job.ret
+ (sid, jobids) = submit_jobs(functionJobs)
+ #print 'checking whether finished'
+ #while not get_status(sid, jobids):
+ # time.sleep(7)
+ #print 'collecting jobs'
+ #retjobs = collect_jobs(sid, jobids, functionJobs)
+ #print "ret fields AFTER execution on cluster"
+ #for (i, job) in enumerate(retjobs):
+ # print "Job #", i, "- ret: ", job.ret
+
+ #print '--------------'
if __name__ == '__main__':
import cPickle
import sys
+import time
import pdb
import os
import os.path
import math
-from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+from pythongrid import KybJob, Usage
+from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
from qpalma_main import *
for c_name,current_chunk in chunks:
current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
- current_job.h_vmem = '30.0G'
+ current_job.h_vmem = '20.0G'
#current_job.express = 'True'
print "job #1: ", current_job.nativeSpecification
param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
- dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
- prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
+ #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.pickle'
+ #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.keys.pickle'
+
+ #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
+ #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
+
+ #run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+ #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.pickle'
+ #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.keys.pickle'
+
+ run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+ dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.pickle'
+ prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.keys.pickle'
prediction_keys = cPickle.load(open(prediction_keys_fn))
print 'Found %d keys for prediction.' % len(prediction_keys)
num_splits = 25
+ #num_splits = 1
slices = get_slices(len(prediction_keys),num_splits)
chunks = []
for idx,slice in enumerate(slices):
- c_name = 'chunk_%d' % idx
- chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
+ #if idx != 0:
+ c_name = 'chunk_%d' % idx
+ chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
functionJobs = makeJobs(run,dataset_fn,chunks,param)
for size in [len(elem) for name,elem in chunks]:
sum += size
- assert sum == len(prediction_keys)
+ #assert sum == len(prediction_keys)
print 'Got %d job(s)' % len(functionJobs)
- print "output ret field in each job before sending it onto the cluster"
- for (i, job) in enumerate(functionJobs):
- print "Job with id: ", i, "- ret: ", job.ret
+ #print "output ret field in each job before sending it onto the cluster"
+ #for (i, job) in enumerate(functionJobs):
+ # print "Job with id: ", i, "- ret: ", job.ret
+
+ #print ""
+ #print "sending function jobs to cluster"
+ #print ""
+
+ #pdb.set_trace()
- print ""
- print "sending function jobs to cluster"
- print ""
+ (sid, jobids) = submit_jobs(functionJobs)
- processedFunctionJobs = processJobs(functionJobs)
+ #print 'checking whether finished'
+ #while not get_status(sid, jobids):
+ # time.sleep(7)
+ #print 'collecting jobs'
+ #retjobs = collect_jobs(sid, jobids, functionJobs)
+ #print "ret fields AFTER execution on cluster"
+ #for (i, job) in enumerate(retjobs):
+ # print "Job #", i, "- ret: ", job.ret
- print "ret fields AFTER execution on cluster"
- for (i, job) in enumerate(processedFunctionJobs):
- print "Job with id: ", i, "- ret: ", job.ret
+ #print '--------------'
def g_predict(run,dataset_fn,prediction_keys,param,set_name):
import sys
import cPickle
import pdb
-import re
import os.path
+import array
-from qpalma.sequence_utils import *
+from qpalma.sequence_utils import get_seq_and_scores,reverse_complement,unbracket_seq
import numpy
from numpy.matlib import mat,zeros,ones,inf
#from qpalma.SIQP_CPX import SIQPSolver
#from qpalma.SIQP_CVXOPT import SIQPSolver
-import array
-from qpalma.DataProc import *
from qpalma.computeSpliceWeights import *
from qpalma.set_param_palma import *
from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
+from qpalma.Plif import Plf,compute_donacc
-from qpalma.Configuration import *
+import QPalmaConfiguration as Conf
-# this two imports are needed for the load genomic resp. interval query
+# these two imports are needed for the load-genomic and the interval-query
# functions
-from Genefinding import *
-from genome_utils import load_genomic
+#from Genefinding import *
+#from genome_utils import load_genomic
from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
class SpliceSiteException:
dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
# splice score is located at g of ag
- ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
- assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
+ #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
+ #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
+ #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
+ #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
- gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
- assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
-
- #original_exons = Exons[exampleIdx]
original_exons = currentExons
exons = original_exons - (up_cut-1)
exons[0,0] -= 1
# calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
currentAlignment.myalign( current_num_path, dna, dna_len,\
est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
- print_matrix)
+ acceptor, a_len, c_qualityPlifs, run['remove_duplicate_scores'],\
+ run['print_matrix'] )
c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*current_num_path))
c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*current_num_path))
"""
self.run = run
- full_working_path = os.path.join(run['alignment_dir'],run['name'])
+ #full_working_path = os.path.join(run['alignment_dir'],run['name'])
+ full_working_path = run['result_dir']
print 'full_working_path is %s' % full_working_path
# we do not need the full dataset anymore
del dataset
- # Set the parameters such as limits penalties for the Plifs
+ # Set the parameters such as limits/penalties for the Plifs
[h,d,a,mmatrix,qualityPlifs] =\
set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
#############################################################################################
# Prediction
#############################################################################################
+
self.plog('Starting prediction...\n')
donSP = self.run['numDonSuppPoints']
# where we store the predictions
allPredictions = []
+ # we take the first quality vector of the tuple of quality vectors
+ quality_index = 0
+
# beginning of the prediction loop
for example_key in prediction_set.keys():
print 'Current example %d' % example_key
for example in prediction_set[example_key]:
- currentSeqInfo,original_est,currentQualities = example
+ currentSeqInfo,original_read,currentQualities = example
- id,chr,strand,genomicSeq_start,genomicSeq_stop =\
+ id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
currentSeqInfo
- assert id == example_key
+ # remove brackets from the original read annotation
+ read = unbracket_seq(original_read)
- if not chr in range(1,6):
- continue
+ if strand == '-':
+ read = reverse_complement(read)
self.plog('Loading example id: %d...\n'% int(id))
- est = original_est
- est = unbracket_est(est)
-
if run['mode'] == 'normal':
- quality = [40]*len(est)
+ quality = [40]*len(read)
if run['mode'] == 'using_quality_scores':
- quality = currentQualities[0]
+ quality = currentQualities[quality_index]
if not run['enable_quality_scores']:
- quality = [40]*len(est)
-
- current_example_predictions = []
+ quality = [40]*len(read)
try:
- currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+ currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
except:
problem_ctr += 1
continue
if elem != -inf:
currentAcc[idx] = 0.0
- current_prediction = self.calc_alignment(currentDNASeq, est,\
+ current_prediction = self.calc_alignment(currentDNASeq, read,\
quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
current_prediction['id'] = id
#current_prediction['start_pos'] = up_cut
current_prediction['start_pos'] = genomicSeq_start
- current_prediction['chr'] = chr
+ current_prediction['chr'] = chromo
current_prediction['strand'] = strand
allPredictions.append(current_prediction)
self.logfh.close()
- def calc_alignment(self, dna, est, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+ def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
"""
Given two sequences and the parameters we calculate on alignment
"""
donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
dna = str(dna)
- est = str(est)
+ read = str(read)
- if '-' in est:
+ if '-' in read:
self.plog('found gap\n')
- est = est.replace('-','')
- assert len(est) == Conf.read_size
+ read = read.replace('-','')
+ assert len(read) == Conf.read_size
dna_len = len(dna)
- est_len = len(est)
+ read_len = len(read)
ps = h.convert2SWIG()
newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
- newQualityPlifsFeatures, dna_array, est_array =\
- self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
+ newQualityPlifsFeatures, dna_array, read_array =\
+ self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
mm_len = run['matchmatrixRows']*run['matchmatrixCols']
_newSpliceAlign = array.array('B',newSpliceAlign.flatten().tolist()[0])
_newEstAlign = array.array('B',newEstAlign.flatten().tolist()[0])
- alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
+ alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
dna_array = array.array('B',dna_array)
- est_array = array.array('B',est_array)
+ read_array = array.array('B',read_array)
#line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
#self.plog(line1+'\n')
newExons = self.calculatePredictedExons(newSpliceAlign)
- current_prediction = {'predExons':newExons, 'dna':dna, 'est':est, 'DPScores':newDPScores,\
+ current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
'alignment':alignment,\
'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
- 'dna_array':dna_array, 'est_array':est_array }
+ 'dna_array':dna_array, 'read_array':read_array }
return current_prediction
return newExons
+
###########################
# A simple command line
# interface