From: fabio Date: Mon, 10 Dec 2007 12:14:34 +0000 (+0000) Subject: + fixed some index bugs X-Git-Url: http://git.tuebingen.mpg.de/?p=qpalma.git;a=commitdiff_plain;h=801812459ebe331bbba7ae686129248923040d8e + fixed some index bugs + added debugging output git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@6961 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- diff --git a/QPalmaDP/qpalma_dp.cpp b/QPalmaDP/qpalma_dp.cpp index 0474ded..4c33425 100644 --- a/QPalmaDP/qpalma_dp.cpp +++ b/QPalmaDP/qpalma_dp.cpp @@ -6,11 +6,6 @@ using namespace std; myalign([nr_paths], dna, est, {h}, matchmatrix, donor, acceptor, remove_duplicate_scores, ... print_matrix) */ -/* nlhs - Number of expected mxArrays (Left Hand Side) - plhs - Array of pointers to expected outputs - nrhs - Number of inputs (Right Hand Side) - prhs - Array of pointers to input data. */ - Alignment::Alignment() { len = 0; limits = 0; @@ -45,36 +40,10 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est, printf("Entering myalign...\n"); mlen = 6; // score matrix: length of 6 for "- A C G T N" - dna_len = dna_len_p; - est_len = est_len_p; + dna_len = dna_len_p + 1 ; + est_len = est_len_p + 1 ; nr_paths = nr_paths_p; -#if 0 - /***************************************************************************/ - // 3. h (cell array) - /***************************************************************************/ - mwSize anz_func = 1; - - functions = read_penalty_struct_from_cell(prhs[arg], anz_func); - //std::cout << " bla " << functions->max_len << " \n"; - //std::cout << "lookup_penalty(5) = " << lookup_penalty(functions, 5, NULL, 0)< 4 - - - /***************************************************************************/ - // 4. match matrix (columnwise) - /***************************************************************************/ - int nr_entries = mxGetN(prhs[arg]) * mxGetM(prhs[arg]); - //mexPrintf("\nNumber of entries in matchmatrix: %d\n", nr_entries) ; - double *matchmatrix; - matchmatrix = mxGetPr(prhs[arg]); - //mexPrintf("matchmatrix\n") ; - //for (int i=0; i< nr_entries; i++) { - // mexPrintf("entry %d: <%f>\n", i, matchmatrix[i]); - //} - arg++; // arg -> 5 -#endif - /***************************************************************************/ // initialize alignment matrices and call fill_matrix() /***************************************************************************/ @@ -115,16 +84,15 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est, printf("before init of splice_align and rest...\n"); - splice_align = new int[(dna_len)*nr_paths]; - est_align = new int[(est_len)*nr_paths]; + splice_align = new int[(dna_len-1)*nr_paths]; + est_align = new int[(est_len-1)*nr_paths]; mmatrix_param = new int[(mlen*mlen)*nr_paths]; alignmentscores = new double[nr_paths]; //alignment score for each path/matrix - printf("Lengths are %d %d %d...\n",dna_len,est_len,mlen); printf("before memset...\n"); - memset((char*)splice_align, -1, (dna_len)*nr_paths*sizeof(int)); // fills splice_align with zeros - memset((char*)est_align, -1, (est_len)*nr_paths*sizeof(int)); // fills est_align with zeros + memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with zeros + memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with zeros memset((char*)mmatrix_param, 0, mlen*mlen*nr_paths*sizeof(int)); //fills mmatrix_param with zeros memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros @@ -133,8 +101,8 @@ void Alignment::myalign(int nr_paths_p, char* dna, int dna_len_p, char* est, for (int z=0; z 3 * 30 parameters (1 .. 90): y_1 ... y_30 +# 3 * 30 supporting points: x_1 ... x_30 => 3 * 30 parameters (1 .. 90): y_1 ... y_30 # piecewise linear function: -# # take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3 # these are our values x # @@ -27,9 +26,11 @@ def calculateWeights(plf, scores): for k in range(len(scores)): value = scores[k] Lower = len([elem for elem in plf.limits if elem <= value]) + # because we count from 0 in python + Lower -= 1 Upper = Lower+1 ; # x-werte bleiben fest - #print Lower,Upper,len(plf.limits) + print value,Lower,Upper if Lower == 0: currentWeight[0] += 1 @@ -41,8 +42,8 @@ def calculateWeights(plf, scores): currentWeight[Upper] = currentWeight[Upper] + weightup currentWeight[Lower] = currentWeight[Lower] + weightlow - #print value,plf.limits[Lower],plf.limits[Upper] - #print weightup,weightlow,currentWeight[Upper],currentWeight[Lower] + print plf.limits[Lower],plf.limits[Upper] + print weightup,weightlow,currentWeight[Upper],currentWeight[Lower] return currentWeight diff --git a/python/qpalma.py b/python/qpalma.py index d974747..33ac32d 100644 --- a/python/qpalma.py +++ b/python/qpalma.py @@ -71,33 +71,16 @@ class QPalma: self.plog('Initializing problem...\n') - def createParameterVector(self, trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix): - currentW = zeros((self.numFeatures,1)) - - donS = self.numDonSuppPoints - accS = self.numAccSuppPoints - lenS = self.numLengthSuppPoints - matS = self.sizeMMatrix - qualS = self.numQualSuppPoints - - currentW[0:donS,0] = trueWeightDon[:,0] - currentW[donS:donS+accS,0] = trueWeightAcc[:,0] - currentW[donS+accS:donS+accS+lenS,0] = trueWeightIntron[:,0] - currentW[donS+accS+lenS:donS+accS+lenS+matS,0] = trueWeightMatch[:,0] - currentW[donS+accS+lenS+matS:donS+accS+lenS+matS+qualS,0] = qualityMatrix[:,0] - - return currentW - - def plog(self,string): self.logfh.write(string) def run(self): - # Load the whole dataset Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS) + #Sequences, Acceptors, Donors, Exons, Ests, QualityScores = paths_load_data('training',self.genome_info,self.ARGS) + # number of training instances N = len(Sequences) self.numExamples = N @@ -128,7 +111,8 @@ class QPalma: Donors[i] = 1 # Initialize solver - solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh) + if not __debug__: + solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh) # stores the number of alignments done for each example (best path, second-best path etc.) num_path = [anzpath]*N @@ -164,12 +148,8 @@ class QPalma: # Calculate the weights trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp) - - #trueWeight = self.createParameterVector(trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix) trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ]) - #currentPhi = createFeatureVector() - currentPhi = zeros((self.numFeatures,1)) currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1) currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1) @@ -229,28 +209,30 @@ class QPalma: newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx])) newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx]) + pdb.set_trace() + currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores) spliceAlign = zeros((num_path[exampleIdx]*dna_len,1)) weightMatch = zeros((num_path[exampleIdx]*mm_len,1)) + print 'spliceAlign' for i in range(dna_len*num_path[exampleIdx]): spliceAlign[i] = newSpliceAlign[i] + print '%f' % (spliceAlign[i]) + print 'weightMatch' for i in range(mm_len*num_path[exampleIdx]): weightMatch[i] = newWeightMatch[i] + print '%f' % (weightMatch[i]) for i in range(num_path[exampleIdx]): AlignmentScores[i+1] = newAlignmentScores[i] + + print AlignmentScores spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len) weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len) - - #pdb.set_trace() - #print spliceAlign[:,:] - #print weightMatch[:,:] - #print AlignmentScores - # Calculate weights of the respective alignments Note that we are # calculating n-best alignments without any hamming loss, so we # have to keep track which of the n-best alignments correspond to @@ -274,20 +256,14 @@ class QPalma: path_loss[pathNr+1] += 1 # Gewichte in restliche Zeilen der Matrix speichern - # Weights[pathNr,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)] wp = numpy.vstack([weightIntron, weightDon, weightAcc, weightMatch[pathNr,:].T, qualityMatrix ]) allWeights[:,pathNr+1] = wp - #print allWeights[:,0] - #print allWeights[:,1] - #print allWeights[:,2] - hpen = mat(h.penalties).reshape(len(h.penalties),1) dpen = mat(d.penalties).reshape(len(d.penalties),1) apen = mat(a.penalties).reshape(len(a.penalties),1) features = numpy.vstack([hpen , dpen , apen , mmatrix[:]]) - AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0] # Check wether scalar product + loss equals viterbi score @@ -299,11 +275,10 @@ class QPalma: if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5: true_map[pathNr+1] = 1 - # # the true label sequence should not have a larger score than the - # # maximal one WHYYYYY? + # the true label sequence should not have a larger score than the maximal one WHYYYYY? - # # this means that all n-best paths are to close to each other - # # we have to extend the n-best search to a (n+1)-best + # this means that all n-best paths are to close to each other + # we have to extend the n-best search to a (n+1)-best if len([elem for elem in true_map if elem == 1]) == len(true_map): num_path[exampleIdx] = num_path[exampleIdx]+1 @@ -313,32 +288,22 @@ class QPalma: if elem == 0: firstFalseIdx = map_idx break - + # if there is at least one useful false alignment add the # corresponding constraints to the optimization problem if firstFalseIdx != -1: trueWeights = allWeights[:,0] firstFalseWeights = allWeights[:,firstFalseIdx] - # if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis[exampleIdx]<1+column_eps, - # e=zeros(1,N) ; - # e[exampleIdx] = 1 ; - # A=[A; Weights(2,:)-Weights(1,:) zeros(1,126) -e] ; - # b=[b; -1] ; - # gap[exampleIdx] = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis[exampleIdx] ; - # else - # gap[exampleIdx] = 0 ; - # end ; - - # LMM.py code: - deltas = firstFalseWeights - trueWeights - const_added = solver.addConstraint(deltas, exampleIdx) - - objValue,w,self.slacks = solver.solve() - - sum_xis = 0 - for elem in self.slacks: - sum_xis += elem + # LMM.py code: + deltas = firstFalseWeights - trueWeights + if not __debug__: + const_added = solver.addConstraint(deltas, exampleIdx) + objValue,w,self.slacks = solver.solve() + + sum_xis = 0 + for elem in self.slacks: + sum_xis += elem if exampleIdx==0: break diff --git a/tools/calculateAlignmentQuality.m b/tools/calculateAlignmentQuality.m index 84bc4d4..0338a4b 100644 --- a/tools/calculateAlignmentQuality.m +++ b/tools/calculateAlignmentQuality.m @@ -19,7 +19,7 @@ for i = 1:length(ground_truth) for j = 1:length(testrun) currentPred = testrun(j); - if strcmp(currentEntry.chr,currentPred.chr) && currentEntry.is_alt_spliced + if ~strcmp(currentEntry.chr,currentPred.chr) || currentEntry.is_alt_spliced continue; end @@ -41,13 +41,11 @@ for i = 1:length(ground_truth) %disp(sprintf('estIdx %i ',estIdx)); numberOfExons = size(currentExons{estIdx},1); - for exonIdx = 1:numberOfExons - %disp(sprintf('exonIdx %i ',exonIdx)); - exonStart = currentExons{estIdx}(exonIdx,1); - exonStop = currentExons{estIdx}(exonIdx,2); + for exonIdx = 1:(numberOfExons-1) + intronStart = currentExons{estIdx}(exonIdx,2); + intronStop = currentExons{estIdx}(exonIdx+1,1); for predEstIdx = 1:numberOfPredEsts - %disp(sprintf('predEstIdx %i ',predEstIdx)); numberOfPredExons = size(currentPredExons{predEstIdx},1); currentESTName = currentPred.transcripts{predEstIdx}; @@ -58,17 +56,19 @@ for i = 1:length(ground_truth) %disp('\n'); %%disp(sprintf('%i %i %i %i %i %i\n',i,j,estIdx,predEstIdx,exonIdx,predExonIdx)); - - if exonStart >= predExonStart && exonStart <= predExonStop - %%disp('Overlapping'); - fprintf(fh,sprintf('%s before %d\n',currentESTName,exonStart-predExonStart)) + + % est is covering full intron + if intronStart >= predExonStart && intronStop <= predExonStop + fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop)) + % est is nested inside intron + if intronStart < predExonStart && intronStop > predExonStop + fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop)) % end of exonth is nested inside predExoniction - elseif exonStop >= predExonStart && exonStop <= predExonStop - %%disp('Overlapping'); - fprintf(fh,sprintf('%s after %d\n',currentESTName,predExonStop-exonStop)) + elseif intronStart >= predExonStart && predExonStop >= intronStart && intronStop >= predExonStop + fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop)) % predExoniction is nested inside exonth - elseif exonStart <= predExonStart && predExonStop <= exonStop - %disp('Overlapping'); + elseif intronStart <= predExonStart && predExonStart <= intronStop && intronStop <= predExonStop + fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop)) fprintf('%s %d %d',fh) else d=1;