myalign([nr_paths], dna, est, {h}, matchmatrix, donor, acceptor, remove_duplicate_scores, ...
print_matrix) */
-/* nlhs - Number of expected mxArrays (Left Hand Side)
- plhs - Array of pointers to expected outputs
- nrhs - Number of inputs (Right Hand Side)
- prhs - Array of pointers to input data. */
-
Alignment::Alignment() {
len = 0;
limits = 0;
printf("Entering myalign...\n");
mlen = 6; // score matrix: length of 6 for "- A C G T N"
- dna_len = dna_len_p;
- est_len = est_len_p;
+ dna_len = dna_len_p + 1 ;
+ est_len = est_len_p + 1 ;
nr_paths = nr_paths_p;
-#if 0
- /***************************************************************************/
- // 3. h (cell array)
- /***************************************************************************/
- mwSize anz_func = 1;
-
- functions = read_penalty_struct_from_cell(prhs[arg], anz_func);
- //std::cout << " bla " << functions->max_len << " \n";
- //std::cout << "lookup_penalty(5) = " << lookup_penalty(functions, 5, NULL, 0)<<std::endl ;
- arg++; // arg -> 4
-
-
- /***************************************************************************/
- // 4. match matrix (columnwise)
- /***************************************************************************/
- int nr_entries = mxGetN(prhs[arg]) * mxGetM(prhs[arg]);
- //mexPrintf("\nNumber of entries in matchmatrix: %d\n", nr_entries) ;
- double *matchmatrix;
- matchmatrix = mxGetPr(prhs[arg]);
- //mexPrintf("matchmatrix\n") ;
- //for (int i=0; i< nr_entries; i++) {
- // mexPrintf("entry %d: <%f>\n", i, matchmatrix[i]);
- //}
- arg++; // arg -> 5
-#endif
-
/***************************************************************************/
// initialize alignment matrices and call fill_matrix()
/***************************************************************************/
printf("before init of splice_align and rest...\n");
- splice_align = new int[(dna_len)*nr_paths];
- est_align = new int[(est_len)*nr_paths];
+ splice_align = new int[(dna_len-1)*nr_paths];
+ est_align = new int[(est_len-1)*nr_paths];
mmatrix_param = new int[(mlen*mlen)*nr_paths];
alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
- printf("Lengths are %d %d %d...\n",dna_len,est_len,mlen);
printf("before memset...\n");
- memset((char*)splice_align, -1, (dna_len)*nr_paths*sizeof(int)); // fills splice_align with zeros
- memset((char*)est_align, -1, (est_len)*nr_paths*sizeof(int)); // fills est_align with zeros
+ memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with zeros
+ memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with zeros
memset((char*)mmatrix_param, 0, mlen*mlen*nr_paths*sizeof(int)); //fills mmatrix_param with zeros
memset(alignmentscores, -1, nr_paths*sizeof(double)); //fills alignmentscores with zeros
for (int z=0; z<nr_paths; z++) {
result_length = 0 ;
- int* s_align = splice_align + (dna_len)*z; //pointer
- int* e_align = est_align + (est_len)*z ; //pointer
+ int* s_align = splice_align + (dna_len-1)*z; //pointer
+ int* e_align = est_align + (est_len-1)*z ; //pointer
int* mparam = mmatrix_param + (mlen*mlen)*z; //pointer
bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, s_align, e_align, mparam, alignmentscores, max_score_positions);
from numpy import inf
from numpy.matlib import zeros
+import pdb
-# 3 * 30 supporting points: x_1 ... x_30
-# => 3 * 30 parameters (1 .. 90): y_1 ... y_30
+# 3 * 30 supporting points: x_1 ... x_30 => 3 * 30 parameters (1 .. 90): y_1 ... y_30
# piecewise linear function:
-#
# take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3
# these are our values x
#
for k in range(len(scores)):
value = scores[k]
Lower = len([elem for elem in plf.limits if elem <= value])
+ # because we count from 0 in python
+ Lower -= 1
Upper = Lower+1 ; # x-werte bleiben fest
- #print Lower,Upper,len(plf.limits)
+ print value,Lower,Upper
if Lower == 0:
currentWeight[0] += 1
currentWeight[Upper] = currentWeight[Upper] + weightup
currentWeight[Lower] = currentWeight[Lower] + weightlow
- #print value,plf.limits[Lower],plf.limits[Upper]
- #print weightup,weightlow,currentWeight[Upper],currentWeight[Lower]
+ print plf.limits[Lower],plf.limits[Upper]
+ print weightup,weightlow,currentWeight[Upper],currentWeight[Lower]
return currentWeight
self.plog('Initializing problem...\n')
- def createParameterVector(self, trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix):
- currentW = zeros((self.numFeatures,1))
-
- donS = self.numDonSuppPoints
- accS = self.numAccSuppPoints
- lenS = self.numLengthSuppPoints
- matS = self.sizeMMatrix
- qualS = self.numQualSuppPoints
-
- currentW[0:donS,0] = trueWeightDon[:,0]
- currentW[donS:donS+accS,0] = trueWeightAcc[:,0]
- currentW[donS+accS:donS+accS+lenS,0] = trueWeightIntron[:,0]
- currentW[donS+accS+lenS:donS+accS+lenS+matS,0] = trueWeightMatch[:,0]
- currentW[donS+accS+lenS+matS:donS+accS+lenS+matS+qualS,0] = qualityMatrix[:,0]
-
- return currentW
-
-
def plog(self,string):
self.logfh.write(string)
def run(self):
-
# Load the whole dataset
Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
+ #Sequences, Acceptors, Donors, Exons, Ests, QualityScores = paths_load_data('training',self.genome_info,self.ARGS)
+
# number of training instances
N = len(Sequences)
self.numExamples = N
Donors[i] = 1
# Initialize solver
- solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
+ if not __debug__:
+ solver = SIQPSolver(self.numFeatures,self.numExamples,self.C,self.logfh)
# stores the number of alignments done for each example (best path, second-best path etc.)
num_path = [anzpath]*N
# Calculate the weights
trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-
- #trueWeight = self.createParameterVector(trueWeightDon, trueWeightAcc, trueWeightIntron, trueWeightMatch, qualityMatrix)
trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, qualityMatrix ])
- #currentPhi = createFeatureVector()
-
currentPhi = zeros((self.numFeatures,1))
currentPhi[0:30] = mat(d.penalties[:]).reshape(30,1)
currentPhi[30:60] = mat(a.penalties[:]).reshape(30,1)
newWeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+ pdb.set_trace()
+
currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+ print 'spliceAlign'
for i in range(dna_len*num_path[exampleIdx]):
spliceAlign[i] = newSpliceAlign[i]
+ print '%f' % (spliceAlign[i])
+ print 'weightMatch'
for i in range(mm_len*num_path[exampleIdx]):
weightMatch[i] = newWeightMatch[i]
+ print '%f' % (weightMatch[i])
for i in range(num_path[exampleIdx]):
AlignmentScores[i+1] = newAlignmentScores[i]
+
+ print AlignmentScores
spliceAlign = spliceAlign.reshape(num_path[exampleIdx],dna_len)
weightMatch = weightMatch.reshape(num_path[exampleIdx],mm_len)
-
- #pdb.set_trace()
- #print spliceAlign[:,:]
- #print weightMatch[:,:]
- #print AlignmentScores
-
# Calculate weights of the respective alignments Note that we are
# calculating n-best alignments without any hamming loss, so we
# have to keep track which of the n-best alignments correspond to
path_loss[pathNr+1] += 1
# Gewichte in restliche Zeilen der Matrix speichern
- # Weights[pathNr,:] = [weightIntron, weightDon, weightAcc, weightMatch(pathNr,:)]
wp = numpy.vstack([weightIntron, weightDon, weightAcc, weightMatch[pathNr,:].T, qualityMatrix ])
allWeights[:,pathNr+1] = wp
- #print allWeights[:,0]
- #print allWeights[:,1]
- #print allWeights[:,2]
-
hpen = mat(h.penalties).reshape(len(h.penalties),1)
dpen = mat(d.penalties).reshape(len(d.penalties),1)
apen = mat(a.penalties).reshape(len(a.penalties),1)
features = numpy.vstack([hpen , dpen , apen , mmatrix[:]])
-
AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
# Check wether scalar product + loss equals viterbi score
if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
true_map[pathNr+1] = 1
- # # the true label sequence should not have a larger score than the
- # # maximal one WHYYYYY?
+ # the true label sequence should not have a larger score than the maximal one WHYYYYY?
- # # this means that all n-best paths are to close to each other
- # # we have to extend the n-best search to a (n+1)-best
+ # this means that all n-best paths are to close to each other
+ # we have to extend the n-best search to a (n+1)-best
if len([elem for elem in true_map if elem == 1]) == len(true_map):
num_path[exampleIdx] = num_path[exampleIdx]+1
if elem == 0:
firstFalseIdx = map_idx
break
-
+
# if there is at least one useful false alignment add the
# corresponding constraints to the optimization problem
if firstFalseIdx != -1:
trueWeights = allWeights[:,0]
firstFalseWeights = allWeights[:,firstFalseIdx]
- # if size(Weights,1)>1 & sum((Weights(1,:)-Weights(2,:)).*param)+xis[exampleIdx]<1+column_eps,
- # e=zeros(1,N) ;
- # e[exampleIdx] = 1 ;
- # A=[A; Weights(2,:)-Weights(1,:) zeros(1,126) -e] ;
- # b=[b; -1] ;
- # gap[exampleIdx] = 1-sum((Weights(1,:)-Weights(2,:)).*param)-xis[exampleIdx] ;
- # else
- # gap[exampleIdx] = 0 ;
- # end ;
-
- # LMM.py code:
- deltas = firstFalseWeights - trueWeights
- const_added = solver.addConstraint(deltas, exampleIdx)
-
- objValue,w,self.slacks = solver.solve()
-
- sum_xis = 0
- for elem in self.slacks:
- sum_xis += elem
+ # LMM.py code:
+ deltas = firstFalseWeights - trueWeights
+ if not __debug__:
+ const_added = solver.addConstraint(deltas, exampleIdx)
+ objValue,w,self.slacks = solver.solve()
+
+ sum_xis = 0
+ for elem in self.slacks:
+ sum_xis += elem
if exampleIdx==0:
break
for j = 1:length(testrun)
currentPred = testrun(j);
- if strcmp(currentEntry.chr,currentPred.chr) && currentEntry.is_alt_spliced
+ if ~strcmp(currentEntry.chr,currentPred.chr) || currentEntry.is_alt_spliced
continue;
end
%disp(sprintf('estIdx %i ',estIdx));
numberOfExons = size(currentExons{estIdx},1);
- for exonIdx = 1:numberOfExons
- %disp(sprintf('exonIdx %i ',exonIdx));
- exonStart = currentExons{estIdx}(exonIdx,1);
- exonStop = currentExons{estIdx}(exonIdx,2);
+ for exonIdx = 1:(numberOfExons-1)
+ intronStart = currentExons{estIdx}(exonIdx,2);
+ intronStop = currentExons{estIdx}(exonIdx+1,1);
for predEstIdx = 1:numberOfPredEsts
- %disp(sprintf('predEstIdx %i ',predEstIdx));
numberOfPredExons = size(currentPredExons{predEstIdx},1);
currentESTName = currentPred.transcripts{predEstIdx};
%disp('\n');
%%disp(sprintf('%i %i %i %i %i %i\n',i,j,estIdx,predEstIdx,exonIdx,predExonIdx));
-
- if exonStart >= predExonStart && exonStart <= predExonStop
- %%disp('Overlapping');
- fprintf(fh,sprintf('%s before %d\n',currentESTName,exonStart-predExonStart))
+
+ % est is covering full intron
+ if intronStart >= predExonStart && intronStop <= predExonStop
+ fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop))
+ % est is nested inside intron
+ if intronStart < predExonStart && intronStop > predExonStop
+ fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop))
% end of exonth is nested inside predExoniction
- elseif exonStop >= predExonStart && exonStop <= predExonStop
- %%disp('Overlapping');
- fprintf(fh,sprintf('%s after %d\n',currentESTName,predExonStop-exonStop))
+ elseif intronStart >= predExonStart && predExonStop >= intronStart && intronStop >= predExonStop
+ fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop))
% predExoniction is nested inside exonth
- elseif exonStart <= predExonStart && predExonStop <= exonStop
- %disp('Overlapping');
+ elseif intronStart <= predExonStart && predExonStart <= intronStop && intronStop <= predExonStop
+ fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop))
fprintf('%s %d %d',fh)
else
d=1;