###########################################################
#
+# The QPalma project aims at extending the Palma project
+# to be able to use Solexa reads toegether with their
+# quality scores.
+#
+# This file represents the conversion of the main matlab
+# training loop for Palma to python.
+#
+# Author: Fabio De Bona
#
-#
###########################################################
import sys
import QPalmaDP
import qpalma
+import qpalma.Configuration
+
from qpalma.SIQP_CPX import SIQPSolver
from qpalma.DataProc import *
-
from qpalma.generateEvaluationData import *
from qpalma.computeSpliceWeights import *
from qpalma.set_param_palma import *
from qpalma.computeSpliceAlignWithQuality import *
from qpalma.penalty_lookup_new import *
from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
from qpalma.export_param import *
-
-import qpalma.Configuration
+from qpalma.TrainingParam import Param
from qpalma.Plif import Plf
-from qpalma.Helpers import *
from qpalma.tools.splicesites import getDonAccScores
-def getQualityFeatureCounts(qualityPlifs):
- weightQuality = qualityPlifs[0].penalties
- for currentPlif in qualityPlifs[1:]:
- weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
-
- return weightQuality
+class para:
+ pass
class QPalma:
"""
#self.ARGS.train_with_splicesitescoreinformation = False
- def plog(self,string):
- self.logfh.write(string)
- self.logfh.flush()
-
- def run(self):
# Load the whole dataset
if Configuration.mode == 'normal':
#Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+ self.use_quality_scores = False
- Donors, Acceptors = getDonAccScores(Sequences)
-
- use_quality_scores = False
elif Configuration.mode == 'using_quality_scores':
-
filename = 'real_dset_%s'% 'recent'
if True:# not os.path.exists(filename):
Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
end = 1000
- Sequences = Sequences[:end]
- Exons = Exons[:end]
- Ests = Ests[:end]
- Qualities = Qualities[:end]
- Acceptors = Acceptors[:end]
- Donors = Donors[:end]
-
- pdb.set_trace()
-
- Donors, Acceptors = getDonAccScores(Sequences)
+ self.Sequences = Sequences[:end]
+ self.Exons = Exons[:end]
+ self.Ests = Ests[:end]
+ self.Qualities = Qualities[:end]
+ self.Acceptors = Acceptors[:end]
+ self.Donors = Donors[:end]
+
+ self.Donors, self.Acceptors = getDonAccScores(Sequences)
#dset = Dataset()
- #dset.Sequences = Sequences
- #dset.Acceptor = Acceptors
- #dset.Donors = Donors
- #dset.Exons = Exons
- #dset.Ests = Ests
- #dset.Qualities = Qualities
- #cPickle.dump(dset,open(filename,'w+'))
else:
dset = cPickle.load(open(filename))
Sequences = dset.Sequences
#Qualities = []
#for i in range(len(Ests)):
# Qualities.append([40]*len(Ests[i]))
- use_quality_scores = True
+ self.use_quality_scores = True
else:
assert(False)
self.plog('Number of training examples: %d\n'% N)
print 'Number of features: %d\n'% Configuration.numFeatures
+ def plog(self,string):
+ self.logfh.write(string)
+ self.logfh.flush()
+
+ def train(self):
+ beg = Configuration.training_begin
+ end = Configuration.training_end
+
+ Sequences = self.Sequences[beg:end]
+ Exons = self.Exons[beg:end]
+ Ests = self.Ests[beg:end]
+ Qualities = self.Qualities[beg:end]
+ Acceptors = self.Acceptors[beg:end]
+ Donors = self.Donors[beg:end]
+
iteration_steps = Configuration.iter_steps ; #upper bound on iteration steps
remove_duplicate_scores = Configuration.remove_duplicate_scores
print_matrix = Configuration.print_matrix
solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
# stores the number of alignments done for each example (best path, second-best path etc.)
- num_path = [anzpath]*N
+ num_path = [anzpath]*self.numExamples
# stores the gap for each example
- gap = [0.0]*N
+ gap = [0.0]*self.numExamples
#############################################################################################
# Training
don_supp = Donors[exampleIdx]
acc_supp = Acceptors[exampleIdx]
- #if exons[-1,1] > len(dna):
- # continue
-
- #pdb.set_trace()
# Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
# Create the alignment object representing the interface to the C/C++ code.
- currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, use_quality_scores)
+ currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, self.use_quality_scores)
c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
#print 'Calling myalign...'
newDPScores[i] = c_DPScores[i]
- if use_quality_scores:
+ if self.use_quality_scores:
for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
#cPickle.dump(pa,open('elegans.param','w+'))
self.logfh.close()
+
+
+ def predict(self):
+ beg = Configuration.prediction_begin
+ end = Configuration.prediction_end
+
+ Sequences = self.Sequences[beg:end]
+ Exons = self.Exons[beg:end]
+ Ests = self.Ests[beg:end]
+ Qualities = self.Qualities[beg:end]
+ Acceptors = self.Acceptors[beg:end]
+ Donors = self.Donors[beg:end]
+
+ remove_duplicate_scores = Conf.remove_duplicate_scores
+ print_matrix = Conf.print_matrix
+ anzpath = Conf.anzpath
+
+ param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
+ param = load_param(param_filename)
+
+ # Set the parameters such as limits penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+ #############################################################################################
+ # Prediction
+ #############################################################################################
+ self.plog('Starting prediction...\n')
+
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+ numq = Conf.numQualSuppPoints
+ totalQualSP = Conf.totalQualSuppPoints
+
+ currentPhi = zeros((Conf.numFeatures,1))
+ totalQualityPenalties = zeros((totalQualSP,1))
+
+ total_up_off = []
+ total_down_off = []
+
+ for exampleIdx in range(self.numExamples):
+ dna = Sequences[exampleIdx]
+ est = Ests[exampleIdx]
+ currentSplitPos = SplitPos[exampleIdx]
+
+ if Conf.mode == 'normal':
+ quality = [40]*len(est)
+
+ if Conf.mode == 'using_quality_scores':
+ quality = Qualities[exampleIdx]
+
+ exons = Exons[exampleIdx]
+ # NoiseMatrix = Noises[exampleIdx]
+ don_supp = Donors[exampleIdx]
+ acc_supp = Acceptors[exampleIdx]
+
+ if exons[-1,1] > len(dna):
+ continue
+
+ # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+ # Calculate the weights
+ trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+ trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+ currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ if Conf.mode == 'using_quality_scores':
+ totalQualityPenalties = param[-totalQualSP:]
+ currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
+
+ # Calculate w'phi(x,y) the total score of the alignment
+ trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+ # The allWeights vector is supposed to store the weight parameter
+ # of the true alignment as well as the weight parameters of the
+ # 1 other alignments
+ allWeights = zeros((Conf.numFeatures,1+1))
+ allWeights[:,0] = trueWeight[:,0]
+
+ AlignmentScores = [0.0]*(1+1)
+ AlignmentScores[0] = trueAlignmentScore
+
+ ################## Calculate wrong alignment(s) ######################
+
+ # Compute donor, acceptor with penalty_lookup_new
+ # returns two double lists
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ #myalign wants the acceptor site on the g of the ag
+ acceptor = acceptor[1:]
+ acceptor.append(-inf)
+
+ dna = str(dna)
+ est = str(est)
+ dna_len = len(dna)
+ est_len = len(est)
+
+ ps = h.convert2SWIG()
+
+ prb = QPalmaDP.createDoubleArrayFromList(quality)
+ chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+ matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+ mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+
+ d_len = len(donor)
+ donor = QPalmaDP.createDoubleArrayFromList(donor)
+ a_len = len(acceptor)
+ acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+ # Create the alignment object representing the interface to the C/C++ code.
+ currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
+
+ c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+ # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+ currentAlignment.myalign(1, dna, dna_len,\
+ est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+ acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+ print_matrix)
+
+ c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
+ c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
+ c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
+ c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
+
+ c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
+
+ currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+ c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+ newSpliceAlign = zeros((dna_len,1))
+ newEstAlign = zeros((est_len,1))
+ newWeightMatch = zeros((mm_len,1))
+ newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
+
+ #print 'newSpliceAlign'
+ for i in range(dna_len):
+ newSpliceAlign[i] = c_SpliceAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'newEstAlign'
+ for i in range(est_len):
+ newEstAlign[i] = c_EstAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'weightMatch'
+ for i in range(mm_len):
+ newWeightMatch[i] = c_WeightMatch[i]
+ # print '%f' % (weightMatch[i])
+
+ newDPScores = c_DPScores[0]
+
+ for i in range(Conf.totalQualSuppPoints):
+ newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+ #print "Calling destructors"
+ del c_SpliceAlign
+ del c_EstAlign
+ del c_WeightMatch
+ del c_DPScores
+ del c_qualityPlifsFeatures
+ del currentAlignment
+
+ newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
+ newWeightMatch = newWeightMatch.reshape(1,mm_len)
+ # Calculate weights of the respective alignments Note that we are
+ # calculating n-best alignments without hamming loss, so we
+ # have to keep track which of the n-best alignments correspond to
+ # the true one in order not to incorporate a true alignment in the
+ # constraints. To keep track of the true and false alignments we
+ # define an array true_map with a boolean indicating the
+ # equivalence to the true alignment for each decoded alignment.
+ true_map = [0]*2
+ true_map[0] = 1
+ pathNr = 0
+
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+
+ decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+ for qidx in range(Conf.totalQualSuppPoints):
+ decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+
+ # Gewichte in restliche Zeilen der Matrix speichern
+ wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
+ allWeights[:,pathNr+1] = wp
+
+ hpen = mat(h.penalties).reshape(len(h.penalties),1)
+ dpen = mat(d.penalties).reshape(len(d.penalties),1)
+ apen = mat(a.penalties).reshape(len(a.penalties),1)
+ features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+ AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+ # if the pathNr-best alignment is very close to the true alignment consider it as true
+ if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+ true_map[pathNr+1] = 1
+
+ up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ #print up_off,down_off
+
+ if up_off > -1:
+ total_up_off.append(up_off)
+ total_down_off.append(down_off)
+
+ total_up = 0
+ total_down = 0
+ for idx in range(len(total_up_off)):
+ total_up += total_up_off[idx]
+ total_down += total_down_off[idx]
+
+ total_up /= len(total_up_off)
+ total_down /= len(total_down_off)
+
+ print 'Mean up_off is %f' % total_up
+ print 'Mean down_off is %f' % total_down
+ print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
+ print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
+
+ print 'Prediction completed'
+ self.logfh.close()
+
def fetch_genome_info(ginfo_filename):
if not os.path.exists(ginfo_filename):
cmd = ['']*4
else:
return cPickle.load(open(ginfo_filename))
+def plifs2param(h,d,a,mmatrix,qualityPlifs):
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+
+
+ param = zeros((Conf.numFeatures,1))
+ param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ for idx in range(len(qualityPlifs)):
+ currentPlif = qualityPlifs[idx]
+ begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+ end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+ param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
+
+ return param
+
+def load_param(filename):
+ param = None
+ #try:
+ # para = cPickle.load(open(filename))
+ #except:
+ # print 'Error: Could not open parameter file!'
+ # sys.exit(1)
+ #
+ #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
+
+ param = cPickle.load(open(filename))
+ return param
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
+ newExons = []
+ oldElem = -1
+ SpliceAlign = SpliceAlign.flatten().tolist()[0]
+ SpliceAlign.append(-1)
+ for pos,elem in enumerate(SpliceAlign):
+ if pos == 0:
+ oldElem = -1
+ else:
+ oldElem = SpliceAlign[pos-1]
+
+ if oldElem != 0 and elem == 0: # start of exon
+ newExons.append(pos-1)
+
+ if oldElem == 0 and elem != 0: # end of exon
+ newExons.append(pos)
+
+ up_off = -1
+ down_off = -1
+
+ if len(newExons) != 4:
+ acc = 0.0
+ else:
+ e1_begin,e1_end = newExons[0],newExons[1]
+ e2_begin,e2_end = newExons[2],newExons[3]
+
+ up_off = int(math.fabs(e1_end - exons[0,1]))
+ down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+ #pdb.set_trace()
+
+ return up_off,down_off
+
if __name__ == '__main__':
+
qpalma = QPalma()
- qpalma.run()
+ if len(sys.argv) != 2:
+ print 'You have to choose between training or prediction mode:'
+ print 'python qpalma. py (train|predict)'
+
+ mode = sys.argv[1]
+ assert mode in ['train','predict']
+
+ if mode == 'train':
+ qpalma.train()
+
+ if mode == 'predict':
+ qpalma.predict()
+