delete[] matrices[i];
}
-void Alignment::getAlignmentResults(int* s_align, int* e_align,
+void Alignment::getAlignmentResultsOld(int* s_align, int* e_align,
int* mmatrix_p, double* alignscores, double* qScores) {
- int idx;
+ size_t idx;
for(idx=0; idx<splice_align_size; idx++)
s_align[idx] = splice_align[idx];
//printf("Leaving getAlignmentResults...\n");
}
-void Alignment::getAlignmentArrays(int* dna_align, int* est_align) {
+void Alignment::getAlignmentArraysOld(int* dna_align, int* est_align) {
int idx;
for(idx=0; idx<result_len; idx++) {
est_align[idx] = EST_ARRAY[idx];
}
}
+
+
+PyObject* Alignment::getAlignmentResults() {
+
+ PyObject* result = PyTuple_New(5);
+ PyObject* py_splice_align = PyList_New(splice_align_size);
+ PyObject* py_est_align = PyList_New(est_align_size);
+ PyObject* py_mmatrix = PyList_New(mmatrix_param_size);
+ PyObject* py_align_scores = PyList_New(alignmentscores_size);
+ PyObject* py_q_scores = PyList_New(0);
+
+ size_t idx;
+ for(idx=0; idx<splice_align_size; idx++)
+ PyList_SetItem(py_splice_align,idx,PyInt_FromLong(splice_align[idx]));
+ //s_align[idx] = splice_align[idx];
+
+ for(idx=0; idx<est_align_size; idx++)
+ PyList_SetItem(py_est_align,idx,PyInt_FromLong(est_align[idx]));
+
+ //e_align[idx] = est_align[idx];
+
+ for(idx=0; idx<mmatrix_param_size; idx++)
+ PyList_SetItem(py_mmatrix,idx,PyInt_FromLong(mmatrix_param[idx]));
+
+ //mmatrix_p[idx] = mmatrix_param[idx];
+
+ for(idx=0; idx<alignmentscores_size; idx++)
+ PyList_SetItem(py_align_scores,idx,PyFloat_FromDouble(alignmentscores[idx]));
+
+ //alignscores[idx] = alignmentscores[idx];
+
+ if (use_quality_scores) {
+ penalty_struct currentPlif;
+ int ctr=0;
+ for (int z=0; z<nr_paths; z++) {
+ for(int estChar=1;estChar<6;estChar++) {
+ for(int dnaChar=0;dnaChar<6;dnaChar++) {
+
+ int currentPos = (estChar-1)*6+dnaChar;
+ currentPlif = qualityFeaturesAllPaths[z][currentPos];
+
+ for(int pidx=0; pidx<currentPlif.len; pidx++) {
+ //qScores[ctr] = currentPlif.penalties[pidx];
+ //printf("%f ",qScores[ctr]);
+ PyList_Append(py_q_scores,PyFloat_FromDouble(currentPlif.penalties[pidx]));
+ ctr++;
+ }
+ //printf("\n");
+ }}}
+ //printf("\nctr is %d\n",ctr);
+ }
+
+ PyTuple_SetItem(result,0,py_splice_align);
+ PyTuple_SetItem(result,1,py_est_align);
+ PyTuple_SetItem(result,2,py_mmatrix);
+ PyTuple_SetItem(result,3,py_align_scores);
+ PyTuple_SetItem(result,4,py_q_scores);
+
+ return result;
+}
+
+PyObject* Alignment::getAlignmentArrays() {
+
+ PyObject* result = PyTuple_New(2);
+ PyObject* dna_align = PyList_New(result_len);
+ PyObject* est_align = PyList_New(result_len);
+
+ int idx;
+ for(idx=0; idx<result_len; idx++) {
+ PyList_SetItem(dna_align,idx,PyInt_FromLong(DNA_ARRAY[idx]));
+ PyList_SetItem(est_align,idx,PyInt_FromLong(EST_ARRAY[idx]));
+ }
+
+ return result;
+}
jobs=[]
+ # fetch the data needed
+ g_dir = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+ don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+ g_fmt = 'chr%d.dna.flat'
+ s_fmt = 'contig_%d%s'
+
+ num_chromo = 6
+
+ accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+ seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
for c_name,current_chunk in chunks:
- current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,c_name])
+ current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,seqInfo,c_name])
current_job.h_vmem = '20.0G'
#current_job.express = 'True'
(sid, jobids) = submit_jobs(functionJobs)
-def g_predict(run,dataset_fn,prediction_keys,param_fn,set_name):
+def g_predict(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
"""
"""
qp = QPalma(False)
- qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,set_name)
+ qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name)
return 'finished prediction of set %s.' % set_name
import QPalmaConfiguration as Conf
-# these two imports are needed for the load genomic resp. interval query
-# functions
-#from Genefinding import *
-#from genome_utils import load_genomic
from Utils import calc_stat, calc_info, pprint_alignment, get_alignment
class SpliceSiteException:
dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
- # splice score is located at g of ag
- #ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and dna[p-1]=='a' and dna[p]=='g' ]
- #assert ag_tuple_pos == [p for p,e in enumerate(acc_supp) if e != -inf and p > 1], pdb.set_trace()
- #gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')]
- #assert gt_tuple_pos == [p for p,e in enumerate(don_supp) if e != -inf and p > 0], pdb.set_trace()
-
original_exons = currentExons
exons = original_exons - (up_cut-1)
exons[0,0] -= 1
c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
- currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+ currentAlignment.getAlignmentArraysOld(c_dna_array,c_est_array)
+ __dna_array,__est_array = currentAlignment.getAlignmentArrays()
dna_array = [0.0]*result_len
est_array = [0.0]*result_len
dna_array = None
est_array = None
- currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+ assert dna_array == __dna_array
+ assert est_array == __est_array
+
+ currentAlignment.getAlignmentResultsOld(c_SpliceAlign, c_EstAlign,\
c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+ __newSpliceAlign, __newEstAlign, __newWeightMatch, __newDPScores, __newQualityPlifsFeatures =\
+ currentAlignment.getAlignmentResults()
+
newSpliceAlign = zeros((current_num_path*dna_len,1))
newEstAlign = zeros((est_len*current_num_path,1))
newWeightMatch = zeros((current_num_path*mm_len,1))
del c_qualityPlifsFeatures
del currentAlignment
+ assert newSpliceAlign = __newSpliceAlign
+ assert newEstAlign = __newEstAlign
+ assert newWeightMatch = __newWeightMatch
+ assert newDPScores = __newDPScores
+ assert newQualityPlifsFeatures = __newQualityPlifsFeatures
+
return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
newQualityPlifsFeatures, dna_array, est_array
#
###############################################################################
- def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,set_name):
+ def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
"""
Performing a prediction takes...
"""
self.run = run
self.set_name = set_name
- #full_working_path = os.path.join(run['alignment_dir'],run['name'])
full_working_path = run['result_dir']
print 'full_working_path is %s' % full_working_path
self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
-
# number of prediction instances
self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
# Load parameter vector to predict with
param = cPickle.load(open(param_fn))
- self.predict(run,prediction_set,param)
+ self.predict(run,prediction_set,param,seqInfo)
- def predict(self,run,prediction_set,param):
+ def predict(self,run,prediction_set,param,seqInfo):
"""
- This method...
+ This method contains the prediction loop over all reads of the given dataset.
"""
self.run = run
- if self.run['mode'] == 'normal':
- self.use_quality_scores = False
-
- elif self.run['mode'] == 'using_quality_scores':
- self.use_quality_scores = True
- else:
- assert(False)
+ self.use_quality_scores = False
+ if self.run['mode'] == 'using_quality_scores':
+ use_quality_scores = True
# Set the parameters such as limits/penalties for the Plifs
[h,d,a,mmatrix,qualityPlifs] =\
if not self.qpalma_debug_mode:
self.plog('Starting prediction...\n')
- donSP = self.run['numDonSuppPoints']
- accSP = self.run['numAccSuppPoints']
- lengthSP = self.run['numLengthSuppPoints']
- mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
- numq = self.run['numQualSuppPoints']
- totalQualSP = self.run['totalQualSuppPoints']
-
- totalQualityPenalties = zeros((totalQualSP,1))
+ #donSP = self.run['numDonSuppPoints']
+ #accSP = self.run['numAccSuppPoints']
+ #lengthSP = self.run['numLengthSuppPoints']
+ #mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
+ #numq = self.run['numQualSuppPoints']
+ #totalQualSP = self.run['totalQualSuppPoints']
+ #totalQualityPenalties = zeros((totalQualSP,1))
problem_ctr = 0
# we take the first quality vector of the tuple of quality vectors
quality_index = 0
- # fetch the data needed
- g_dir = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
- don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
- g_fmt = 'chr%d.dna.flat'
- s_fmt = 'contig_%d%s'
-
- num_chromo = 6
-
- accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
- seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
# beginning of the prediction loop
for example_key in prediction_set.keys():
- print 'Current example %d' % example_key
-
for example in prediction_set[example_key]:
currentSeqInfo,read,currentQualities = example
if not self.qpalma_debug_mode:
self.plog('Loading example id: %d...\n'% int(id))
- if run['mode'] == 'normal':
- quality = [40]*len(read)
-
- if run['mode'] == 'using_quality_scores':
+ if use_quality_scores = True
quality = currentQualities[quality_index]
-
- if not run['enable_quality_scores']:
+ else:
quality = [40]*len(read)
try:
continue
if not run['enable_splice_signals']:
- for idx,elem in enumerate(currentDon):
- if elem != -inf:
+ for idx in range(len(currentDon)):
+ if currentDon[idx] != -inf:
currentDon[idx] = 0.0
- for idx,elem in enumerate(currentAcc):
- if elem != -inf:
+ if currentAcc[idx] != -inf:
currentAcc[idx] = 0.0
current_prediction = self.calc_alignment(currentDNASeq, read,\