currentMode = NORMAL;
// dnaest
- double *DNA_ARRAY = 0;
- double *EST_ARRAY = 0;
+ DNA_ARRAY = 0;
+ EST_ARRAY = 0;
//possible donor positions
int nr_donor_sites = 0 ;
// printf("%f ",pidx,p.penalties[pidx]);
// printf("\n");
//}
+ //
+ if(z==0) {
- //if(DNA_ARRAY != 0) {
- // delete[] DNA_ARRAY;
- // delete[] EST_ARRAY;
- // }
-
- //DNA_ARRAY = new double[result_length];
- //EST_ARRAY = new double[result_length];
-
- // //backtracking
- // int i = max_score_positions[2*z] ; //i (est)
- // int j = max_score_positions[2*z +1] ; //j (dna)
- // int prev_i = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_i;
- // int prev_j = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_j;
- // int prev_m_no = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_matrix_no;
- //
- // for (int ii=result_length; ii>0; ii--) {
- // if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
- //DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
- //EST_ARRAY[ii-1] = check_char(est[i-1]) ;
- // }
- // else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST_ARRAY
- //DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
- //EST_ARRAY[ii-1] = 0 ; //gap
- // }
- // else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA_ARRAY
- //DNA_ARRAY[ii-1] = 0 ; //gap
- //EST_ARRAY[ii-1] = check_char(est[i-1]) ;
- // }
- // else {// splice site
- //for (j; j > prev_j; j--) {
- // DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
- // EST_ARRAY[ii-1] = 6 ; //intron
- // ii-- ;
- //}
- //ii++ ; // last ii-- too much (because done in for-loop)
- // }
- //
- // i = prev_i;
- // j = prev_j;
- // prev_i = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_i;
- // prev_j = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_j;
- // prev_m_no = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_matrix_no;
- // }
- //
- } //end of z
+ if(DNA_ARRAY != 0) {
+ delete[] DNA_ARRAY;
+ delete[] EST_ARRAY;
+ }
+
+ result_len = result_length;
+
+ DNA_ARRAY = new int[result_length];
+ EST_ARRAY = new int[result_length];
+
+ //backtracking
+ int i = max_score_positions[2*z] ; //i (est)
+ int j = max_score_positions[2*z +1] ; //j (dna)
+ int prev_i = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_i;
+ int prev_j = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_j;
+ int prev_m_no = ((Pre_score*)matrices[z] + i*dna_len +j)->prev_matrix_no;
+
+ for (int ii=result_length; ii>0; ii--) {
+ if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
+ DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ EST_ARRAY[ii-1] = check_char(est[i-1]) ;
+ }
+ else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST_ARRAY
+ DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ EST_ARRAY[ii-1] = 0 ; //gap
+ }
+ else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA_ARRAY
+ DNA_ARRAY[ii-1] = 0 ; //gap
+ EST_ARRAY[ii-1] = check_char(est[i-1]) ;
+ }
+ else {// splice site
+ for (j; j > prev_j; j--) {
+ DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ EST_ARRAY[ii-1] = 6 ; //intron
+ ii-- ;
+ }
+ ii++ ; // last ii-- too much (because done in for-loop)
+ }
+
+ i = prev_i;
+ j = prev_j;
+ prev_i = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_i;
+ prev_j = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_j;
+ prev_m_no = ((Pre_score*)matrices[prev_m_no] + i*dna_len + j)->prev_matrix_no;
+ }
- //if(DNA_ARRAY != 0) {
- // delete[] DNA_ARRAY;
- // delete[] EST_ARRAY;
- // }
+ } // end of "if z == 0"
+
+ } //end of z
for (int i=nr_paths-1;i>=0; i--)
delete[] matrices[i];
//printf("\nctr is %d\n",ctr);
//printf("Leaving getAlignmentResults...\n");
}
+
+void Alignment::getAlignmentArrays(int* dna_align, int* est_align) {
+
+ int idx;
+ for(idx=0; idx<result_len; idx++) {
+ dna_align[idx] = DNA_ARRAY[idx];
+ est_align[idx] = EST_ARRAY[idx];
+ }
+}
from qpalma.tools.splicesites import getDonAccScores
from qpalma.Configuration import *
+from compile_dataset import compile_d
+
+import palma.palma_utils as pu
+from palma.output_formating import print_results
+
class para:
pass
def __init__(self):
self.ARGS = Param()
- #from compile_dataset import compile_d
#compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
-
- from compile_dataset import compile_d
- compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
+ #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
#gen_file= '%s/genome.config' % self.ARGS.basedir
#ginfo_filename = 'genome_info.pickle'
else:
assert(False)
+ print 'leaving constructor...'
+
def plog(self,string):
self.logfh.write(string)
self.logfh.flush()
# Set the parameters such as limits penalties for the Plifs
[h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
- pdb.set_trace()
+ #pdb.set_trace()
+
+ lengthSP = Conf.numLengthSuppPoints
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+ numq = Conf.numQualSuppPoints
+ totalQualSP = Conf.totalQualSuppPoints
# Initialize solver
if Conf.USE_OPT:
self.plog('Initializing problem...\n')
solver = SIQPSolver(Conf.numFeatures,numExamples,Conf.C,self.logfh)
+ solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+ solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
# stores the number of alignments done for each example (best path, second-best path etc.)
num_path = [anzpath]*numExamples
#############################################################################################
self.plog('Starting training...\n')
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
- numq = Conf.numQualSuppPoints
- totalQualSP = Conf.totalQualSuppPoints
-
currentPhi = zeros((Conf.numFeatures,1))
totalQualityPenalties = zeros((totalQualSP,1))
AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
# Check wether scalar product + loss equals viterbi score
- #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
distinct_scores = False
if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
distinct_scores = True
+ # Check wether scalar product + loss equals viterbi score
if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
pdb.set_trace()
#
# call solver every nth example //added constraint
- if exampleIdx != 0 and exampleIdx % 100 == 0 and Conf.USE_OPT:
+ if exampleIdx != 0 and exampleIdx % Conf.numConstraintsPerRound == 0 and Conf.USE_OPT:
objValue,w,self.slacks = solver.solve()
if math.fabs(objValue - self.oldObjValue) <= 1e-6:
sum_xis = 0
for elem in self.slacks:
sum_xis += elem
+
+ print 'sum of slacks is %f'% sum_xis
for i in range(len(param)):
param[i] = w[i]
c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
+ result_len = currentAlignment.getResultLength()
+ c_dna_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
+ c_est_array = QPalmaDP.createIntArrayFromList([0]*(result_len))
+
+ currentAlignment.getAlignmentArrays(c_dna_array,c_est_array)
+
c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
true_map[0] = 1
pathNr = 0
+ dna_array = [0.0]*result_len
+ est_array = [0.0]*result_len
+
+ for r_idx in range(result_len):
+ dna_array[r_idx] = c_dna_array[r_idx]
+ est_array[r_idx] = c_est_array[r_idx]
+
+ _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
+ _newEstAlign = newEstAlign.flatten().tolist()[0]
+
+ #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+ #self.plog(line1+'\n')
+ #self.plog(line2+'\n')
+ #self.plog(line3+'\n')
+
weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
true_map[pathNr+1] = 1
- e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
if e1_b_off != None:
exon1Begin.append(e1_b_off)
else:
allWrongExons.append((newExons,exons))
+
+ logfile = self.logfh
+
+ if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0 and e2_e_off == 0:
+ print >> logfile, 'example %d correct' % exampleIdx
+ else:
+ print >> logfile, 'example %d wrong' % exampleIdx
+
e1Begin_pos,e1Begin_neg,e1End_pos,e1End_neg,mean_e1Begin_neg,mean_e1End_neg = self.evaluatePositions(exon1Begin,exon1End)
e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = self.evaluatePositions(exon2Begin,exon2End)
- print 'Total num. of Examples: %d' % numExamples
+ all_pos_correct = 0
+ for idx in range(len(exon1Begin)):
+ if exon1Begin[idx] == 0 and exon1End[idx] == 0\
+ and exon2Begin[idx] == 0 and exon2End[idx] == 0:
+ all_pos_correct += 1
+
+ logfile = self.logfh
+ print >> logfile, 'Total num. of examples: %d' % numExamples
+ print >> logfile, 'Number of total correct examples: %d' % all_pos_correct
+ print >> logfile, 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
+ print >> logfile, 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
+ print >> logfile, 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
+
+ print 'Total num. of examples: %d' % numExamples
+ print 'Number of total correct examples: %d' % all_pos_correct
print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
self.logfh.close()
def evaluatePositions(self,eBegin,eEnd):
+
eBegin_pos = [elem for elem in eBegin if elem == 0]
eBegin_neg = [elem for elem in eBegin if elem != 0]
eEnd_pos = [elem for elem in eEnd if elem == 0]
return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
+ def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,spos,exampleIdx):
+ newExons = []
+ oldElem = -1
+ SpliceAlign = SpliceAlign.flatten().tolist()[0]
+ SpliceAlign.append(-1)
+ for pos,elem in enumerate(SpliceAlign):
+ if pos == 0:
+ oldElem = -1
+ else:
+ oldElem = SpliceAlign[pos-1]
+
+ if oldElem != 0 and elem == 0: # start of exon
+ newExons.append(pos)
+
+ if oldElem == 0 and elem != 0: # end of exon
+ newExons.append(pos)
+
+ e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
+
+ self.plog(("Example %d"%exampleIdx) + str(exons) + " "+ str(newExons) + "\n")
+ self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
+
+ if len(newExons) == 4:
+ e1_begin,e1_end = newExons[0],newExons[1]
+ e2_begin,e2_end = newExons[2],newExons[3]
+ else:
+ return None,None,None,None,newExons
+
+ e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
+ e1_e_off = int(math.fabs(e1_end - exons[0,1]))
+
+ e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
+ e2_e_off = int(math.fabs(e2_end - exons[1,1]))
+
+ return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
def fetch_genome_info(ginfo_filename):
if not os.path.exists(ginfo_filename):
else:
return cPickle.load(open(ginfo_filename))
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
- newExons = []
- oldElem = -1
- SpliceAlign = SpliceAlign.flatten().tolist()[0]
- SpliceAlign.append(-1)
- for pos,elem in enumerate(SpliceAlign):
- if pos == 0:
- oldElem = -1
- else:
- oldElem = SpliceAlign[pos-1]
-
- if oldElem != 0 and elem == 0: # start of exon
- newExons.append(pos)
-
- if oldElem == 0 and elem != 0: # end of exon
- newExons.append(pos)
-
- e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
-
- if len(newExons) == 4:
- e1_begin,e1_end = newExons[0],newExons[1]
- e2_begin,e2_end = newExons[2],newExons[3]
- else:
- return None,None,None,None,newExons
-
- e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
- e1_e_off = int(math.fabs(e1_end - exons[0,1]))
-
- e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
- e2_e_off = int(math.fabs(e2_end - exons[1,1]))
-
- return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
def calcStat(Acceptor,Donor,Exons):
maxDon = max(max(don),maxDon)
minDon = min(min(don),minDon)
+def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
+ (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
+ pu.get_splice_info(_newSpliceAlign,_newEstAlign)
+
+ t_strand = '+'
+ translation = '-ACGTN_' #how aligned est and dna sequence is displayed
+ #(gap_char, 5 nucleotides, intron_char)
+ comparison_char = '| ' #first: match_char, second: intron_char
+
+ (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
+ pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
+
+ first_identity = None
+ last_identity = None
+ for i in range(len(comparison)):
+ if comparison[i] == '|' and first_identity is None:
+ first_identity = i
+ if comparison[-i] == '|' and last_identity is None:
+ last_identity = len(comparison) - i - 1
+
+ try:
+ for idx in range(len(dna_array)):
+ dna_array[idx] = translation[int(dna_array[idx])]
+ est_array[idx] = translation[int(est_array[idx])]
+ except:
+ pdb.set_trace()
+
+ line1 = "".join(dna_array)
+ line2 = comparison
+ line3 = "".join(est_array)
+
+ return line1,line2,line3
+
+ #exon_length[-1] = exon_length[-1] - tEnd + tIDX[last_identity] + 1
+
+ #tStart = tIDX[first_identity]
+ #tStarts[0] = tIDX[first_identity]
+ #tEnd = tIDX[last_identity]
+ #tEnds[-1] = tIDX[last_identity]
+ #qStart = qIDX[first_identity]
+ #qStarts[0] = qIDX[first_identity]
+ #qEnd = qIDX[last_identity]
+ #qEnds[-1] = qIDX[last_identity]
+
+ #align_info.qStart = qStart
+ #align_info.qEnd = qEnd
+ #align_info.tStart = tStart
+ #align_info.tEnd = tEnd
+ #align_info.num_exons = num_exons
+ #align_info.qExonSizes = qExonSizes
+ #align_info.qStarts = qStarts
+ #align_info.qEnds = qEnds
+ #align_info.tExonSizes =tExonSizes
+ #align_info.tStarts = tStarts
+ #align_info.tEnds = tEnds
+ #align_info.exon_length = exon_length
+ #align_info.identity = identity
+ #align_info.ssQuality = ssQuality
+ #align_info.exonQuality = exonQuality
+ #align_info.comparison = comparison
+ #align_info.qIDX = qIDX
+ #align_info.tIDX = tIDX
+ #align_info.dna_letters = dna_letters
+ #align_info.est_letters = est_letters
+
if __name__ == '__main__':
qpalma = QPalma()