from numpy import inf
from numpy.matlib import zeros
-def calculateWeights(plf, scores):
- """
- 3 * 30 supporting points: x_1 ... x_30
- => 3 * 30 parameters (1 .. 90): y_1 ... y_30
- piecewise linear function:
-
- take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3
- these are our values x
-
- | y_1 if x <= x_1
- |
- | x_i+1 - x x - x_i
- f(x) = | y_i * ----------- + y_i+1 * ----------- if x_i <= x <= x_i+1
- | x_i+1 - x_i x_i+1 - x_i
- |
- | y_30 if x_n <= x
-
- y_i and y_i+1 parameters, so the fractions are saved in the weight vectors!
- """
+# 3 * 30 supporting points: x_1 ... x_30
+# => 3 * 30 parameters (1 .. 90): y_1 ... y_30
+# piecewise linear function:
+#
+# take score from SVM vektor (don_supp: case 1, acc_supp: case 2) and compute length of intron: case 3
+# these are our values x
+#
+# | y_1 if x <= x_1
+# |
+# | x_i+1 - x x - x_i
+# f(x) = | y_i * ----------- + y_i+1 * ----------- if x_i <= x <= x_i+1
+# | x_i+1 - x_i x_i+1 - x_i
+# |
+# | y_30 if x_n <= x
+#
+# y_i and y_i+1 parameters, so the fractions are saved in the weight vectors!
+def calculateWeights(plf, scores):
currentWeight = zeros((30,1))
for k in range(len(scores)):
value = scores[k]
-
- Lower = 0
Lower = len([elem for elem in plf.limits if elem <= value])
Upper = Lower+1 ; # x-werte bleiben fest
+ print value,Lower,Upper
+
if Lower == 0:
- currentWeight[0] = currentWeight[0] + 1;
+ currentWeight[0] = currentWeight[0] + 1
elif Lower == len(plf.limits):
- currentWeight[-1] = currentWeight[-1] + 1;
+ currentWeight[-1] = currentWeight[-1] + 1
else:
- weightup = (value - plf.limits[Lower]) / (plf.limits[Upper] - plf.limits[Lower]) ;
- weightlow = (plf.limits[Upper] - value) / (plf.limits[Upper] - plf.limits[Lower]) ;
- currentWeight[Upper] = currentWeight[Upper] + weightup;
- currentWeight[Lower] = currentWeight[Lower] + weightlow;
+ weightup = (value - plf.limits[Lower]) / (plf.limits[Upper] - plf.limits[Lower])
+ weightlow = (plf.limits[Upper] - value) / (plf.limits[Upper] - plf.limits[Lower])
+ currentWeight[Upper] = currentWeight[Upper] + weightup
+ currentWeight[Lower] = currentWeight[Lower] + weightlow
- return currentWeight
+ print weightup,weightlow,currentWeight[Upper],currentWeight[Lower]
+ return currentWeight
def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
- #assert(isempty(d.transform))
- #assert(isempty(a.transform))
- #assert(isempty(h.transform))
-
####################################################################################
# 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen
####################################################################################
# Picke die Positionen raus, an denen eine Donorstelle ist
- #DonorScores = don_supp(find(SpliceAlign==1)) ;
- #assert(all(DonorScores~=-Inf)) ;
-
DonorScores = [elem for pos,elem in enumerate(don_supp) if SpliceAlign[pos] == 1]
assert not ( -inf in DonorScores )
+ print 'donor'
weightDon = calculateWeights(d,DonorScores)
####################################################################################
# 2. Acceptor: In acc_supp stehen Werte der SVM., in SpliceAlign die Einsen
####################################################################################
- #AcceptorScores = acc_supp(find(SpliceAlign == 2)+1) ;
- #assert(all(AcceptorScores~=-Inf)) ;
-
#Den Vektor Acceptorstellen durchgehen und die Gewichtsvektoren belasten:
- AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if SpliceAlign[pos+1] == 2]
- #assert not ( -inf in AcceptorScores )
+ AcceptorScores = [elem for pos,elem in enumerate(acc_supp) if pos > 0 and SpliceAlign[pos-1] == 2]
+ assert not ( -inf in AcceptorScores )
+ print 'acceptor'
weightAcc = calculateWeights(a,AcceptorScores)
####################################################################################
# 3. Intron length: SpliceAlign: Gaps zaehlen und auf Gapgewichte addieren
####################################################################################
- #intron_starts = find(SpliceAlign==1) ;
- #intron_ends = find(SpliceAlign==2) ;
+ intron_starts = []
+ intron_ends = []
+ for pos,elem in enumerate(SpliceAlign):
+ if elem == 1:
+ intron_starts.append(pos)
+
+ if elem == 2:
+ intron_ends.append(pos)
+
+ assert len(intron_starts) == len(intron_ends)
+
+ for i in range(len(intron_starts)):
+ assert intron_starts[i] < intron_ends[i]
- #%assert: introns do not overlap
- #assert(length(intron_starts) == length(intron_ends)) ;
- #assert(all(intron_starts < intron_ends)) ;
- #assert(all(intron_ends(1:end-1) < intron_starts(2:end))) ;
+ values = [0.0]*len(intron_starts)
+ for pos in range(len(intron_starts)):
+ values[pos] = intron_ends[pos] - intron_starts[pos] + 1
- #weightIntron = calculateWeights(h,AcceptorScores)
- weightIntron = [1,2,3]
+ weightIntron = calculateWeights(h,values)
return weightDon, weightAcc, weightIntron
import scipy.io
from paths_load_data import *
import pdb
-import numpy.matlib
+from numpy.matlib import mat,zeros,ones
from set_param_palma import *
from computeSpliceAlign import *
import QPalmaDP
from SIQP_CPX import SIQPSolver
-from penalty_lookup import *
+from penalty_lookup_new import *
from compute_donacc import *
+from TrainingParam import Param
+
"""
A training method for the QPalma project
"""
-class Param:
-
- def __init__(self):
- """ default parameters """
-
- self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
- self.MAX_MEM = 31000;
- self.LOCAL_ALIGN = 1;
- self.init_param = 1;
- self.train_with_splicesitescoreinformation = 1;
- self.train_with_intronlengthinformation = 1;
- self.C = 0.001;
- self.microexon = 0;
- self.prob = 0;
- self.organism = 'elegans'
- self.expt = 'training'
-
- self.insertion_prob = self.prob/3 ;
- self.deletion_prob = self.prob/3 ;
- self.mutation_prob = self.prob/3 ;
+def plog(fh,string):
+ fh.write(string)
def run():
ARGS = Param()
anzpath = 2
print_matrix = 0
- param = numpy.matlib.rand(126,1)
+ # param = numpy.matlib.rand(126,1)
+ param = numpy.matlib.mat([[ 0.62870709], [ 0.7012026 ], [ 0.60236784],
+ [ 0.51316259], [ 0.20220814], [ 0.70324863], [ 0.37218684], [ 0.82178927],
+ [ 0.60394866], [ 0.70371272], [ 0.07548074], [ 0.63412803], [ 0.97442266],
+ [ 0.13216791], [ 0.71041168], [ 0.2093887 ], [ 0.35227344], [ 0.3405142 ],
+ [ 0.69677236], [ 0.41673747], [ 0.564245 ], [ 0.37613432], [ 0.88805642],
+ [ 0.88691608], [ 0.69476752], [ 0.81659504], [ 0.17801859], [ 0.71048235],
+ [ 0.08188783], [ 0.54884803], [ 0.84039558], [ 0.6982093 ], [ 0.41686176],
+ [ 0.38568873], [ 0.29401347], [ 0.12704074], [ 0.30640858], [ 0.89578031],
+ [ 0.84621571], [ 0.11783439], [ 0.0944695 ], [ 0.34081575], [ 0.44157643],
+ [ 0.77847185], [ 0.04283567], [ 0.45107823], [ 0.89789891], [ 0.41045519],
+ [ 0.49073531], [ 0.29727627], [ 0.94711483], [ 0.24898204], [ 0.26181212],
+ [ 0.71760957], [ 0.60326883], [ 0.80887576], [ 0.09448718], [ 0.88064525],
+ [ 0.84317654], [ 0.48893703], [ 0.24847021], [ 0.84203596], [ 0.34104156],
+ [ 0.75604701], [ 0.91703057], [ 0.69325475], [ 0.61276969], [ 0.16335226],
+ [ 0.4684374 ], [ 0.16553371], [ 0.79594434], [ 0.6440283 ], [ 0.80922237],
+ [ 0.5349296 ], [ 0.31924316], [ 0.10960695], [ 0.40151062], [ 0.50473641],
+ [ 0.14812671], [ 0.73523169], [ 0.35141625], [ 0.80364238], [ 0.02128181],
+ [ 0.0061226 ], [ 0.34541924], [ 0.07694485], [ 0.05551339], [ 0.23087636],
+ [ 0.87016395], [ 0.31682377], [ 0.27375113], [ 0.72226332], [ 0.62914149],
+ [ 0.59236012], [ 0.2070238 ], [ 0.52390942], [ 0.11894098], [ 0.55725917],
+ [ 0.72706009], [ 0.087196 ], [ 0.04745082], [ 0.95636492], [ 0.31524576],
+ [ 0.79685218], [ 0.80386771], [ 0.70942604], [ 0.82869417], [ 0.26906569],
+ [ 0.51848039], [ 0.64169354], [ 0.07114973], [ 0.39249454], [ 0.07002803],
+ [ 0.94667567], [ 0.02252752], [ 0.01039039], [ 0.5721312 ], [ 0.06065969],
+ [ 0.69422476], [ 0.4310939 ], [ 0.03069099], [ 0.35969779], [ 0.18047331],
+ [ 0.4177651 ], [ 0.01360547], [ 0.29069319]])
[h,d,a,mmatrix] = set_param_palma(param,ARGS.train_with_intronlengthinformation)
+ # checked values till this position
+
# delete splicesite-score-information
if not ARGS.train_with_splicesitescoreinformation:
for i in range(len(Acceptors)):
numExamples = N
C=1.0
- numDonorSupportPoints = 30
- numAcceptorSupportPoints = 30
- numIntronLengthSupportPoints = 30
- sizeMMatrix = 36
- numQualityScoreSupportPoints = 16*0
+ numDonSuppPoints = 30
+ numAccSuppPoints = 30
+ numLengthSuppPoints = 30
+ sizeMMatrix = 36
+ numQualSuppPoints = 16*0
- numFeatures = numDonorSupportPoints\
- + numAcceptorSupportPoints\
- + numIntronLengthSupportPoints\
- + sizeMMatrix\
- + numQualityScoreSupportPoints
+ numFeatures = numDonSuppPoints + numAccSuppPoints + + numLengthSuppPoints\
+ + sizeMMatrix + numQualSuppPoints
- qualityMatrix = [0.0]*numQualityScoreSupportPoints
+ qualityMatrix = zeros((numQualSuppPoints,1))
plog(logfh,'Initializing problem...\n')
+
#problem = SIQPSolver(numFeatures,numExamples,C,logfile)
- #num_path = anzpath*ones(1,N) ; # nr of alignments done (best path, second-best path etc.)
- #gap = zeros(1,N) ; %? sum of differences between true alignment score and 'false' alignment scores
- #for step_nr=1:iteration_steps,
+ num_path = anzpath*ones((1,N)) ; # nr of alignments done (best path, second-best path etc.)
+ gap = zeros((1,N))
plog(logfh,'Starting training...\n')
acc_supp = Acceptors[exampleId]
# Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
+
+ # trueSpliceAlign is equal
+ # trueWeightMatch is equal
trueSpliceAlign, trueWeightMatch = computeSpliceAlign(dna, exons)
+ #print d.limits
+ #print d.penalties
+ #print a.limits
+ #print a.penalties
+ #print h.limits
+ #print h.penalties
+
+ ####################### checked above values ##########################################
+
# Calculate
trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+ #print trueWeightDon.T
+ #print trueWeightAcc.T
+ #print trueWeightIntron.T
+
+ pdb.set_trace()
+
# Reshape currentW param
currentW = zeros((numFeatures,1))
- currentW[0:numDonorSupportPoints,1] = trueWeigthDon[:]
- currentW[0:numAcceptorSupportPoints,1] = trueWeigthAcc[:]
- currentW[0:numIntronLengthSupportPoints,1] = trueWeigthIntron[:]
- currentW[0:numDonorSupportPoints,1] = trueWeigthMatch[:]
-
- #currentPhi = d.penalties a h mmatrix
+
+ currentW[0:numDonSuppPoints,0] = trueWeightDon[:,0]
+ currentW[numDonSuppPoints:numDonSuppPoints+numAccSuppPoints,0] = trueWeightAcc[:,0]
+ currentW[numDonSuppPoints+numAccSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints,0] = trueWeightIntron[:,0]
+ currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix,0] = trueWeightMatch[:,0]
+ currentW[numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix:numDonSuppPoints+numAccSuppPoints+numLengthSuppPoints+sizeMMatrix+numQualSuppPoints,0] = qualityMatrix[:,0]
+
+ currentPhi = zeros((numFeatures,1))
+ currentPhi[0:30] = d.penalties[:]
+ currentPhi[30:60] = a.penalties[:]
+ currentPhi[60:90] = h.penalties[:]
+ currentPhi[90:126] = mmatrix[:]
+ currentPhi[126:] = qualityMatrix[:]
# Calculate w'phi(x,y) the total score of the alignment
- #alignmentScore = currentW.T * currentPhi
+ alignmentScore = currentW.T * currentPhi
#
# Calculate wrong alignments
#
+
+ # Compute donor, acceptor with penalty_lookup_new
+ # returns two double lists
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ #myalign wants the acceptor site on the g of the ag
+ #acceptor = [acceptor(2:end) -Inf] ;
+
nr_paths = 2
dna = 'acgtagct'
dna_len = len(dna)
currentAlignment = QPalmaDP.Alignment()
qualityMat = QPalmaDP.createDoubleArrayFromList(qualityMatrix)
- currentAlignment.setQualityMatrix(qualityMat,numQualitySupportPoints)
+ currentAlignment.setQualityMatrix(qualityMat,numQualSuppPoints)
ps = h.convert2SWIG()
currentAlignment.myalign( nr_paths, dna, dna_len,\
print 'after myalign call...'
- # # Compute donor, acceptor with penalty_lookup_new
- # # returns two double lists
- # [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
-
- # #myalign wants the acceptor site on the g of the ag
- # acceptor = [acceptor(2:end) -Inf] ;
- #
- # # call myalign
- # [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
- # myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
- # remove_duplicate_scores, print_matrix);
-
# SpliceAlign = double(SpliceAlign') ; %column
# weightMatch = double(weightMatch') ;
+
iteration_nr += 1
break
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Wrong Alignments
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
- % Compute donor, acceptor with penalty_lookup_new
- [donor, acceptor] = compute_donacc(don_supp, acc_supp, d, a) ;
-
- %myalign wants the acceptor site on the g of the ag
- acceptor = [acceptor(2:end) -Inf] ;
-
- %call myalign
- [SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest] = ...
- myalign_local(num_path(id), dna, est, {h}, mmatrix, donor, acceptor, ...
- remove_duplicate_scores, print_matrix);
-
- %return values (are int32, have to be double later cause we compute scores
- SpliceAlign = double(SpliceAlign') ; %column
- weightMatch = double(weightMatch') ;
-
%test
for pfadNo = 1:num_path(id)
assert(sum(weightMatch(pfadNo,7:end)) == sum(SpliceAlign(pfadNo,:)==0)) ;
sum_xis += elem
"""
-
print 'Training completed'
if __name__ == '__main__':
-load confirmed_sequences.mat;
+%
+% This script compares two alignments based on their EST overlaps
+%
+% The ground_truth and testrun
+%
+load /fml/ag-raetsch/home/fabio/tmp/zebrafish/confirmed_sequences.mat;
testrun = genes;
load /fml/ag-raetsch/share/projects/altsplicedata/zebrafish/confirmed_sequences.mat;
ground_truth = genes;
clear genes;
+fh = fopen('overlapping.pos','w+')
for i = 1:length(ground_truth)
-
- currentTru = ground_truth(i);
- currentTruExons = currentTru.exons{1};
+ currentEntry = ground_truth(i);
+ currentExons = currentEntry.exons;
+ assert (length(currentEntry.transcripts) == length(currentEntry.exons));
+ numberOfEsts = length(currentEntry.transcripts);
for j = 1:length(testrun)
currentPred = testrun(j);
- if strcmp(currentTru.chr,currentPred.chr) && strcmp(currentTru.strands,currentPred.strands)
+ if strcmp(currentEntry.chr,currentPred.chr) && currentEntry.is_alt_spliced
continue;
end
- truStart = currentTru.start;
- truStop = currentTru.stop;
+ start = currentEntry.start;
+ stop = currentEntry.stop;
predStart = currentPred.start;
predStop = currentPred.stop;
- if predStop < truStart || predStart > truStop
+ % if both entries are not overlapping at all
+ if predStop < start || predStart > stop
continue;
end
- currentPredExons = currentPred.exons{1};
-
- rows = size(currentTruExons,1);
- for ii = 1:rows
- exonStart = currentTruExons(ii,1);
- exonStop = currentTruExons(ii,2);
- for jj = 1:size(currentPredExons,1)
- predExonStart = currentPredExons(jj,1);
- predExonStop = currentPredExons(jj,2);
-
- if exonStart >= predExonStart && exonStart <= predExonStop
- disp('Overlapping');
- % end of exonth is nested inside predExoniction
- elseif exonStop >= predExonStart && exonStop <= predExonStop
- disp('Overlapping');
- % predExoniction is nested inside exonth
- elseif exonStart <= predExonStart && predExonStop <= exonStop
- disp('Overlapping');
- else
- d=1;
+ currentPredExons = currentPred.exons;
+ numberOfPredEsts = length(currentPredExons);
+
+ % Loop over all ESTs in ground truth and testrun
+ for estIdx = 1:numberOfEsts
+ %disp(sprintf('estIdx %i ',estIdx));
+ numberOfExons = size(currentExons{estIdx},1);
+
+ for exonIdx = 1:numberOfExons
+ %disp(sprintf('exonIdx %i ',exonIdx));
+ exonStart = currentExons{estIdx}(exonIdx,1);
+ exonStop = currentExons{estIdx}(exonIdx,2);
+
+ for predEstIdx = 1:numberOfPredEsts
+ %disp(sprintf('predEstIdx %i ',predEstIdx));
+ numberOfPredExons = size(currentPredExons{predEstIdx},1);
+ currentESTName = currentPred.transcripts{predEstIdx};
+
+ for predExonIdx = 1:numberOfPredExons
+ %%disp(sprintf('predExonIdx %i ',predExonIdx));
+ predExonStart = currentPredExons{predEstIdx}(predExonIdx,1);
+ predExonStop = currentPredExons{predEstIdx}(predExonIdx,2);
+
+ %disp('\n');
+ %%disp(sprintf('%i %i %i %i %i %i\n',i,j,estIdx,predEstIdx,exonIdx,predExonIdx));
+
+ if exonStart >= predExonStart && exonStart <= predExonStop
+ %%disp('Overlapping');
+ fprintf(fh,sprintf('%s before %d\n',currentESTName,exonStart-predExonStart))
+ % end of exonth is nested inside predExoniction
+ elseif exonStop >= predExonStart && exonStop <= predExonStop
+ %%disp('Overlapping');
+ fprintf(fh,sprintf('%s after %d\n',currentESTName,predExonStop-exonStop))
+ % predExoniction is nested inside exonth
+ elseif exonStart <= predExonStart && predExonStop <= exonStop
+ %disp('Overlapping');
+ fprintf('%s %d %d',fh)
+ else
+ d=1;
+ end
+ end
end
-
end
end
end