import cPickle
import sys
+import pydb
+import pdb
import os
import os.path
def evaluatePositions(eBegin,eEnd):
-
eBegin_pos = [elem for elem in eBegin if elem == 0]
eBegin_neg = [elem for elem in eBegin if elem != 0]
eEnd_pos = [elem for elem in eEnd if elem == 0]
eEnd_neg = [elem for elem in eEnd if elem != 0]
# mean offset over the incorrect (non-zero) positions, -1 if there are none
mean_eBegin_neg = sum(eBegin_neg)/(1.0*len(eBegin_neg)) if len(eBegin_neg) > 0 else -1
mean_eEnd_neg = sum(eEnd_neg)/(1.0*len(eEnd_neg)) if len(eEnd_neg) > 0 else -1
return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
+
def perform_prediction(current_dir,run_name):
#cmd = 'echo ./doPrediction.sh %s | qsub -l h_vmem=1.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
os.system(cmd)
#print cmd
+
def forall_experiments(current_func,tl_dir):
"""
Given the toplevel directory this function calls for each subdir the
- prediction function of each found qpalma training instance.
-
- Once this is done we can collect the results
+ function given as the first argument.
"""
dir_entries = os.listdir(tl_dir)
dir_entries = [os.path.join(tl_dir,de) for de in dir_entries]
run_dirs = [de for de in dir_entries if os.path.isdir(de)]
for current_dir in run_dirs:
run_name = current_dir.split('/')[-1]
- #print current_dir
-
current_func(current_dir,run_name)
+
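+# A usage sketch for forall_experiments() above (the toplevel path is only a
+# made-up example): first run the prediction for every run directory, then
+# collect and evaluate the results.
+#
+#   forall_experiments(perform_prediction,'/tmp/qpalma_runs')
+#   forall_experiments(collect_prediction,'/tmp/qpalma_runs')
+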
def collect_prediction(current_dir,run_name):
"""
Given a run directory and run name this function loads the pickled training and test set predictions and evaluates them.
-
"""
train_suffix = '_allPredictions_TRAIN'
test_suffix = '_allPredictions_TEST'
jp = os.path.join
filename = jp(current_dir,run_name)+train_suffix
- allTrainPredictions = cPickle.load(open(filename))
+ print 'Prediction on: %s' % filename
+ prediction_on(filename)
+
+ filename = jp(current_dir,run_name)+test_suffix
+ print 'Prediction on: %s' % filename
+ prediction_on(filename)
+
+
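+# collect_prediction() above expects, per run directory, a training pickle named
+# '<run_name>_allPredictions_TRAIN' and a corresponding test pickle (see the
+# train/test suffixes above); these names match the '%s_allPredictions_%s'
+# pattern used by the cPickle.dump at the end of the prediction loop further
+# below. prediction_on() evaluates one such pickle file.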
+def prediction_on(filename):
+ allPredictions = cPickle.load(open(filename))
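+ # Assumed layout (inferred from the loop below and from the prediction dicts
+ # built in the prediction loop): allPredictions holds one entry per example;
+ # each entry is a list of prediction dicts with keys such as 'e1_b_off',
+ # 'e1_e_off', 'e2_b_off', 'e2_e_off' (exon boundary offsets) and 'DPScores'.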
exon1Begin = []
exon1End = []
exon2Begin = []
exon2End = []
allWrongExons = []
-
- for current_pred in allTrainPredictions:
- e1_b_off = current_pred['e1_b_off']
- e1_e_off = current_pred['e1_e_off']
- e2_b_off = current_pred['e2_b_off']
- e2_e_off = current_pred['e2_e_off']
-
- if e1_b_off != None:
- exon1Begin.append(e1_b_off)
- exon1End.append(e1_e_off)
- exon2Begin.append(e2_b_off)
- exon2End.append(e2_e_off)
- else:
- pass
- #allWrongExons.append((newExons,exons))
-
- #if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
- # print >> logfile, 'example %d correct' % exampleIdx
- #else:
- # print >> logfile, 'example %d wrong' % exampleIdx
-
- e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = evaluatePositions(exon1Begin,exon1End)
- e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = evaluatePositions(exon2Begin,exon2End)
+ allDoubleScores = []
+
+ for current_example_pred in allPredictions:
+ ambigous_match = False
+ if len(current_example_pred) > 1:
+ ambigous_match = True
+ example_scores = []
+
+ for current_pred in current_example_pred:
+ e1_b_off = current_pred['e1_b_off']
+ e1_e_off = current_pred['e1_e_off']
+ e2_b_off = current_pred['e2_b_off']
+ e2_e_off = current_pred['e2_e_off']
+
+ if e1_b_off != None:
+ exon1Begin.append(e1_b_off)
+ exon1End.append(e1_e_off)
+ exon2Begin.append(e2_b_off)
+ exon2End.append(e2_e_off)
+ else:
+ pass
+ #allWrongExons.append((newExons,exons))
+
+ if ambigous_match == True:
+ current_score = current_pred['DPScores'][0]
+ example_scores.append(current_score)
+
+ e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = evaluatePositions(exon1Begin,exon1End)
+ e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = evaluatePositions(exon2Begin,exon2End)
+
+ allDoubleScores.append(example_scores)
all_pos_correct = 0
for idx in range(len(exon1Begin)):
if exon1Begin[idx] == 0 and exon1End[idx] == 0 and exon2Begin[idx] == 0 and exon2End[idx] == 0:
all_pos_correct += 1
-
- print 'Total num. of examples: %d' % len(allTrainPredictions)
+ print 'Total num. of examples: %d' % len(allPredictions)
print 'Number of total correct examples: %d' % all_pos_correct
print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
- #filename = jp(current_dir,run_name)+test_suffix
- #allTestPredictions = cPickle.load(open(filename))
-
- #current_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,
- #'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
- #'predExons':newExons, 'trueExons':exons,\
- #'dna':dna, 'est':est,\
- #'DPScores':newDPScores }
-
-
if __name__ == '__main__':
dir = sys.argv[1]
assert os.path.exists(dir), 'Error directory does not exist!'
alternative_seq = process_remapped_reads(reReads,currentFRead,dna_flat_files)
+ #check_remapped_reads(reReads,currentFRead,dna_flat_files)
+
if seq == '':
continue
OriginalEsts.append(original_est)
Qualities.append(qual)
UpCut.append(up_cut)
- #SplitPositions.append(spos)
- #Ids.append(id)
- #FReads.append(currentFRead)
-
AlternativeSequences.append(alternative_seq)
instance_counter += 1
return AlternativeSequences
+def check_remapped_reads(reReads,fRead,dna_flat_files):
+ """
+ For all matches found by Vmatch we determine the DNA fragment on which we
+ would like to perform an alignment during prediction.
+ """
+
+ fragment_size = 3000
+
+ seq = fRead['seq']
+ strand = fRead['strand']
+ pos = fRead['p_start']
+ chr = fRead['chr']
+
+ print 'fRead:\t%d %d %d %d' %\
+ (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])
+
+ for currentRRead in reReads:
+ rId = currentRRead['id']
+
+ pos1 = currentRRead['pos1']
+ chr1 = currentRRead['chr1']
+ seq1 = currentRRead['seq1']
+
+ seq2 = currentRRead['seq2']
+ pos2 = 0
+
+ if not seq2 == '':
+ pos2 = currentRRead['pos2']
+ chr2 = currentRRead['chr2']
+ s2_pos = seq.find(seq2)
+
+ print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))
+
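+# A sketch of the inputs check_remapped_reads() above operates on; all values
+# are made up and only illustrate the dictionary keys the function accesses:
+#
+#   fRead  = {'seq':'ACGT...','strand':'+','chr':1,
+#             'p_start':1000,'exon_stop':1050,'exon_start':1200,'p_stop':1250}
+#   reRead = {'id':1,'chr1':1,'pos1':950,'seq1':'ACGT...',
+#             'chr2':1,'pos2':1190,'seq2':'TTGC...'}
+#   check_remapped_reads([reRead],fRead,dna_flat_files)
+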
def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
"""
if elem != -inf:
acc_supp[idx] = 0.0
- # the code up to here is equal for all alternative positions
+ current_example_predictions = []
+
+ # first make a prediction on the dna fragment which comes from the ground truth
+ current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
+ current_prediction['exampleIdx'] = exampleIdx
+
+ current_example_predictions.append(current_prediction)
+
+ # then make predictions for all dna fragments that occurred in
+ # the Vmatch results
for alternative_alignment in currentAlternatives:
chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
current_prediction = self.calc_alignment(currentDNASeq, est, exons, quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
current_prediction['exampleIdx'] = exampleIdx
- current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
- current_prediction['exampleIdx'] = exampleIdx
+ current_example_predictions.append(current_prediction)
- allPredictions.append(current_prediction)
+ allPredictions.append(current_example_predictions)
# end of the prediction loop we save all predictions in a pickle file and exit
cPickle.dump(allPredictions,open('%s_allPredictions_%s'%(run['name'],set_flag),'w+'))
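# Note: the dumped pickle ('%s_allPredictions_%s' with the run name and the
# set flag, i.e. TRAIN or TEST) stores for every example the ground-truth
# prediction first, followed by one prediction per alternative Vmatch
# alignment; this is the per-example list that prediction_on() iterates over.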