+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import numpy as ny
-import numpy.matlib as nyml
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###########################################################
+#
+# The QPalma project aims at extending the Palma project
+# to be able to use Solexa reads toegether with their
+# quality scores.
+#
+# This file represents the conversion of the main matlab
+# training loop for Palma to python.
+#
+# Author: Fabio De Bona
+#
+###########################################################
+
+import sys
+import subprocess
+import scipy.io
+import pdb
+import os.path
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
+
+import QPalmaDP
+
+import qpalma
+import qpalma.Configuration
+
+from qpalma.SIQP_CPX import SIQPSolver
+from qpalma.DataProc import *
+from qpalma.generateEvaluationData import *
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.penalty_lookup_new import *
+from qpalma.compute_donacc import *
+from qpalma.export_param import *
+from qpalma.TrainingParam import Param
+from qpalma.Plif import Plf
+
+from qpalma.tools.splicesites import getDonAccScores
+
+class para:
+ pass
+
+class QPalma:
+ """
+ A training method for the QPalma project
+ """
+
+ def __init__(self):
+ self.ARGS = Param()
+ self.logfh = open('qpalma.log','w+')
+
+ #gen_file= '%s/genome.config' % self.ARGS.basedir
+ #ginfo_filename = 'genome_info.pickle'
+ #self.genome_info = fetch_genome_info(ginfo_filename)
+ #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
+
+ #self.ARGS.train_with_splicesitescoreinformation = False
+
+ # Load the whole dataset
+ if Configuration.mode == 'normal':
+ #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+ self.use_quality_scores = False
+
+ elif Configuration.mode == 'using_quality_scores':
+
+ filename = 'real_dset_%s'% 'recent'
+ if True:# not os.path.exists(filename):
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+
+ end = 1000
+ self.Sequences = Sequences[:end]
+ self.Exons = Exons[:end]
+ self.Ests = Ests[:end]
+ self.Qualities = Qualities[:end]
+ self.Acceptors = Acceptors[:end]
+ self.Donors = Donors[:end]
+
+ self.Donors, self.Acceptors = getDonAccScores(Sequences)
+ #dset = Dataset()
+ else:
+ dset = cPickle.load(open(filename))
+ Sequences = dset.Sequences
+ Acceptors = dset.Acceptors
+ Donors = dset.Donors
+ Exons = dset.Exons
+ Ests = dset.Ests
+ Qualities = dset.Qualities
+
+ #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+ #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
+ #Qualities = []
+ #for i in range(len(Ests)):
+ # Qualities.append([40]*len(Ests[i]))
+ self.use_quality_scores = True
+ else:
+ assert(False)
+
+ # number of training instances
+ N = len(Sequences)
+ self.numExamples = N
+ assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
+ and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
+ self.plog('Number of training examples: %d\n'% N)
+ print 'Number of features: %d\n'% Configuration.numFeatures
+
+ def plog(self,string):
+ self.logfh.write(string)
+ self.logfh.flush()
+
+ def train(self):
+ beg = Configuration.training_begin
+ end = Configuration.training_end
+
+ Sequences = self.Sequences[beg:end]
+ Exons = self.Exons[beg:end]
+ Ests = self.Ests[beg:end]
+ Qualities = self.Qualities[beg:end]
+ Acceptors = self.Acceptors[beg:end]
+ Donors = self.Donors[beg:end]
+
+ iteration_steps = Configuration.iter_steps ; #upper bound on iteration steps
+ remove_duplicate_scores = Configuration.remove_duplicate_scores
+ print_matrix = Configuration.print_matrix
+ anzpath = Configuration.anzpath
+
+ # Initialize parameter vector / param = numpy.matlib.rand(126,1)
+ param = Configuration.fixedParam
+
+ # Set the parameters such as limits penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+ # delete splicesite-score-information
+ #if not self.ARGS.train_with_splicesitescoreinformation:
+ # for i in range(len(Acceptors)):
+ # if Acceptors[i] > -20:
+ # Acceptors[i] = 1
+ # if Donors[i] >-20:
+ # Donors[i] = 1
+
+ # Initialize solver
+ if Configuration.USE_OPT:
+ self.plog('Initializing problem...\n')
+ solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
+
+ # stores the number of alignments done for each example (best path, second-best path etc.)
+ num_path = [anzpath]*self.numExamples
+ # stores the gap for each example
+ gap = [0.0]*self.numExamples
+
+ #############################################################################################
+ # Training
+ #############################################################################################
+ self.plog('Starting training...\n')
+
+ donSP = Configuration.numDonSuppPoints
+ accSP = Configuration.numAccSuppPoints
+ lengthSP = Configuration.numLengthSuppPoints
+ mmatrixSP = Configuration.sizeMatchmatrix[0]\
+ *Configuration.sizeMatchmatrix[1]
+ numq = Configuration.numQualSuppPoints
+ totalQualSP = Configuration.totalQualSuppPoints
+
+ currentPhi = zeros((Configuration.numFeatures,1))
+ totalQualityPenalties = zeros((totalQualSP,1))
+
+ iteration_nr = 0
+ param_idx = 0
+ const_added_ctr = 0
+ while True:
+ if iteration_nr == iteration_steps:
+ break
+
+ for exampleIdx in range(self.numExamples):
+ if (exampleIdx%100) == 0:
+ print 'Current example nr %d' % exampleIdx
+
+ dna = Sequences[exampleIdx]
+ est = Ests[exampleIdx]
+
+ if Configuration.mode == 'normal':
+ quality = [40]*len(est)
+
+ if Configuration.mode == 'using_quality_scores':
+ quality = Qualities[exampleIdx]
+
+ exons = Exons[exampleIdx]
+ # NoiseMatrix = Noises[exampleIdx]
+ don_supp = Donors[exampleIdx]
+ acc_supp = Acceptors[exampleIdx]
+
+ # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+ #print 'trueWeights'
+ # Calculate the weights
+ trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+ trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+ currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ #pdb.set_trace()
+ if Configuration.mode == 'using_quality_scores':
+ totalQualityPenalties = param[-totalQualSP:]
+ currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
+
+ # Calculate w'phi(x,y) the total score of the alignment
+ trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+ # The allWeights vector is supposed to store the weight parameter
+ # of the true alignment as well as the weight parameters of the
+ # num_path[exampleIdx] other alignments
+ allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
+ allWeights[:,0] = trueWeight[:,0]
+
+ AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
+ AlignmentScores[0] = trueAlignmentScore
+
+ ################## Calculate wrong alignment(s) ######################
+
+ # Compute donor, acceptor with penalty_lookup_new
+ # returns two double lists
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ #myalign wants the acceptor site on the g of the ag
+ acceptor = acceptor[1:]
+ acceptor.append(-inf)
+
+ # for now we don't use donor/acceptor scores
+
+ #donor = [-inf] * len(donor)
+ #acceptor = [-inf] * len(donor)
+
+ dna = str(dna)
+ est = str(est)
+ dna_len = len(dna)
+ est_len = len(est)
+
+ ps = h.convert2SWIG()
+
+ prb = QPalmaDP.createDoubleArrayFromList(quality)
+ chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+ matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+ mm_len = Configuration.sizeMatchmatrix[0]*Configuration.sizeMatchmatrix[1]
+
+ d_len = len(donor)
+ donor = QPalmaDP.createDoubleArrayFromList(donor)
+ a_len = len(acceptor)
+ acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+ # Create the alignment object representing the interface to the C/C++ code.
+ currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, self.use_quality_scores)
+
+ c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+ #print 'Calling myalign...'
+ # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+ currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
+ est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+ acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+ print_matrix)
+
+ #print 'After calling myalign...'
+ #print 'Calling getAlignmentResults...'
+
+ c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
+ c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
+ c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
+ c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
+
+ c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Configuration.totalQualSuppPoints*num_path[exampleIdx]))
+
+ currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+ c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+ #print 'After calling getAlignmentResults...'
+
+ newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
+ newEstAlign = zeros((est_len*num_path[exampleIdx],1))
+ newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
+ newDPScores = zeros((num_path[exampleIdx],1))
+ newQualityPlifsFeatures = zeros((Configuration.totalQualSuppPoints*num_path[exampleIdx],1))
+
+ #print 'newSpliceAlign'
+ for i in range(dna_len*num_path[exampleIdx]):
+ newSpliceAlign[i] = c_SpliceAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'newEstAlign'
+ for i in range(est_len*num_path[exampleIdx]):
+ newEstAlign[i] = c_EstAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'weightMatch'
+ for i in range(mm_len*num_path[exampleIdx]):
+ newWeightMatch[i] = c_WeightMatch[i]
+ # print '%f' % (weightMatch[i])
+
+ #print 'ViterbiScores'
+ for i in range(num_path[exampleIdx]):
+ newDPScores[i] = c_DPScores[i]
+
+
+ if self.use_quality_scores:
+ for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
+ newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+ # equals palma up to here
+
+ #print "Calling destructors"
+ del c_SpliceAlign
+ del c_EstAlign
+ del c_WeightMatch
+ del c_DPScores
+ del c_qualityPlifsFeatures
+ del currentAlignment
+
+ newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+ newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+ # Calculate weights of the respective alignments Note that we are
+ # calculating n-best alignments without hamming loss, so we
+ # have to keep track which of the n-best alignments correspond to
+ # the true one in order not to incorporate a true alignment in the
+ # constraints. To keep track of the true and false alignments we
+ # define an array true_map with a boolean indicating the
+ # equivalence to the true alignment for each decoded alignment.
+ true_map = [0]*(num_path[exampleIdx]+1)
+ true_map[0] = 1
+ path_loss = [0]*(num_path[exampleIdx])
+
+ for pathNr in range(num_path[exampleIdx]):
+ #print 'decodedWeights'
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
+ h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
+ acc_supp)
+
+ decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
+ for qidx in range(Configuration.totalQualSuppPoints):
+ decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Configuration.totalQualSuppPoints)+qidx]
+
+ #pdb.set_trace()
+
+ path_loss[pathNr] = 0
+ # sum up positionwise loss between alignments
+ for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
+ if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
+ path_loss[pathNr] += 1
+
+ #pdb.set_trace()
+
+ # Gewichte in restliche Zeilen der Matrix speichern
+ wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
+ allWeights[:,pathNr+1] = wp
+
+ hpen = mat(h.penalties).reshape(len(h.penalties),1)
+ dpen = mat(d.penalties).reshape(len(d.penalties),1)
+ apen = mat(a.penalties).reshape(len(a.penalties),1)
+ features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+
+ AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+ # Check wether scalar product + loss equals viterbi score
+ #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
+
+ distinct_scores = False
+ if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
+ distinct_scores = True
+
+ #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
+ if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+ pdb.set_trace()
+
+ # # if the pathNr-best alignment is very close to the true alignment consider it as true
+ if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+ true_map[pathNr+1] = 1
+
+ # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
+
+ # the true label sequence should not have a larger score than the maximal one WHYYYYY?
+ # this means that all n-best paths are to close to each other
+ # we have to extend the n-best search to a (n+1)-best
+ if len([elem for elem in true_map if elem == 1]) == len(true_map):
+ num_path[exampleIdx] = num_path[exampleIdx]+1
+
+ # Choose true and first false alignment for extending A
+ firstFalseIdx = -1
+ for map_idx,elem in enumerate(true_map):
+ if elem == 0:
+ firstFalseIdx = map_idx
+ break
+
+ # if there is at least one useful false alignment add the
+ # corresponding constraints to the optimization problem
+ if firstFalseIdx != -1:
+ trueWeights = allWeights[:,0]
+ firstFalseWeights = allWeights[:,firstFalseIdx]
+ differenceVector = trueWeights - firstFalseWeights
+ #pdb.set_trace()
+
+ if Configuration.USE_OPT:
+ const_added = solver.addConstraint(differenceVector, exampleIdx)
+ const_added_ctr += 1
+ #
+ # end of one example processing
+ #
+
+ # call solver every nth example //added constraint
+ if exampleIdx != 0 and exampleIdx % 100 == 0 and Configuration.USE_OPT:
+ objValue,w,self.slacks = solver.solve()
+
+ print "objValue is %f" % objValue
+
+ sum_xis = 0
+ for elem in self.slacks:
+ sum_xis += elem
+
+ for i in range(len(param)):
+ param[i] = w[i]
+
+ #pdb.set_trace()
+ cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+ param_idx += 1
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+ #
+ # end of one iteration through all examples
+ #
+ iteration_nr += 1
+
+ #
+ # end of optimization
+ #
+ print 'Training completed'
+
+ pa = para()
+ pa.h = h
+ pa.d = d
+ pa.a = a
+ pa.mmatrix = mmatrix
+ pa.qualityPlifs = qualityPlifs
+
+ cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+ #cPickle.dump(pa,open('elegans.param','w+'))
+ self.logfh.close()
+
+
+
+ def predict(self):
+ beg = Configuration.prediction_begin
+ end = Configuration.prediction_end
+
+ Sequences = self.Sequences[beg:end]
+ Exons = self.Exons[beg:end]
+ Ests = self.Ests[beg:end]
+ Qualities = self.Qualities[beg:end]
+ Acceptors = self.Acceptors[beg:end]
+ Donors = self.Donors[beg:end]
+
+ remove_duplicate_scores = Conf.remove_duplicate_scores
+ print_matrix = Conf.print_matrix
+ anzpath = Conf.anzpath
+
+ param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
+ param = load_param(param_filename)
+
+ # Set the parameters such as limits penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+
+ #############################################################################################
+ # Prediction
+ #############################################################################################
+ self.plog('Starting prediction...\n')
+
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+ numq = Conf.numQualSuppPoints
+ totalQualSP = Conf.totalQualSuppPoints
+
+ currentPhi = zeros((Conf.numFeatures,1))
+ totalQualityPenalties = zeros((totalQualSP,1))
+
+ total_up_off = []
+ total_down_off = []
+
+ for exampleIdx in range(self.numExamples):
+ dna = Sequences[exampleIdx]
+ est = Ests[exampleIdx]
+ currentSplitPos = SplitPos[exampleIdx]
+
+ if Conf.mode == 'normal':
+ quality = [40]*len(est)
+
+ if Conf.mode == 'using_quality_scores':
+ quality = Qualities[exampleIdx]
+
+ exons = Exons[exampleIdx]
+ # NoiseMatrix = Noises[exampleIdx]
+ don_supp = Donors[exampleIdx]
+ acc_supp = Acceptors[exampleIdx]
+
+ if exons[-1,1] > len(dna):
+ continue
+
+ # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
+ trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+ # Calculate the weights
+ trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+ trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+ currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ if Conf.mode == 'using_quality_scores':
+ totalQualityPenalties = param[-totalQualSP:]
+ currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
+
+ # Calculate w'phi(x,y) the total score of the alignment
+ trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+ # The allWeights vector is supposed to store the weight parameter
+ # of the true alignment as well as the weight parameters of the
+ # 1 other alignments
+ allWeights = zeros((Conf.numFeatures,1+1))
+ allWeights[:,0] = trueWeight[:,0]
+
+ AlignmentScores = [0.0]*(1+1)
+ AlignmentScores[0] = trueAlignmentScore
+
+ ################## Calculate wrong alignment(s) ######################
+
+ # Compute donor, acceptor with penalty_lookup_new
+ # returns two double lists
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ #myalign wants the acceptor site on the g of the ag
+ acceptor = acceptor[1:]
+ acceptor.append(-inf)
+
+ dna = str(dna)
+ est = str(est)
+ dna_len = len(dna)
+ est_len = len(est)
+
+ ps = h.convert2SWIG()
+
+ prb = QPalmaDP.createDoubleArrayFromList(quality)
+ chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+ matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+ mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
+
+ d_len = len(donor)
+ donor = QPalmaDP.createDoubleArrayFromList(donor)
+ a_len = len(acceptor)
+ acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+ # Create the alignment object representing the interface to the C/C++ code.
+ currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
+
+ c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+
+ # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+ currentAlignment.myalign(1, dna, dna_len,\
+ est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+ acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
+ print_matrix)
+
+ c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
+ c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
+ c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
+ c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
+
+ c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
+
+ currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
+ c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
+
+ newSpliceAlign = zeros((dna_len,1))
+ newEstAlign = zeros((est_len,1))
+ newWeightMatch = zeros((mm_len,1))
+ newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
+
+ #print 'newSpliceAlign'
+ for i in range(dna_len):
+ newSpliceAlign[i] = c_SpliceAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'newEstAlign'
+ for i in range(est_len):
+ newEstAlign[i] = c_EstAlign[i]
+ # print '%f' % (spliceAlign[i])
+
+ #print 'weightMatch'
+ for i in range(mm_len):
+ newWeightMatch[i] = c_WeightMatch[i]
+ # print '%f' % (weightMatch[i])
+
+ newDPScores = c_DPScores[0]
+
+ for i in range(Conf.totalQualSuppPoints):
+ newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
+
+ #print "Calling destructors"
+ del c_SpliceAlign
+ del c_EstAlign
+ del c_WeightMatch
+ del c_DPScores
+ del c_qualityPlifsFeatures
+ del currentAlignment
+
+ newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
+ newWeightMatch = newWeightMatch.reshape(1,mm_len)
+ # Calculate weights of the respective alignments Note that we are
+ # calculating n-best alignments without hamming loss, so we
+ # have to keep track which of the n-best alignments correspond to
+ # the true one in order not to incorporate a true alignment in the
+ # constraints. To keep track of the true and false alignments we
+ # define an array true_map with a boolean indicating the
+ # equivalence to the true alignment for each decoded alignment.
+ true_map = [0]*2
+ true_map[0] = 1
+ pathNr = 0
+
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
+
+ decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
+ for qidx in range(Conf.totalQualSuppPoints):
+ decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
+
+ # Gewichte in restliche Zeilen der Matrix speichern
+ wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
+ allWeights[:,pathNr+1] = wp
+
+ hpen = mat(h.penalties).reshape(len(h.penalties),1)
+ dpen = mat(d.penalties).reshape(len(d.penalties),1)
+ apen = mat(a.penalties).reshape(len(a.penalties),1)
+ features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
+ AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+ # if the pathNr-best alignment is very close to the true alignment consider it as true
+ if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+ true_map[pathNr+1] = 1
+
+ up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
+ #print up_off,down_off
+
+ if up_off > -1:
+ total_up_off.append(up_off)
+ total_down_off.append(down_off)
+
+ total_up = 0
+ total_down = 0
+ for idx in range(len(total_up_off)):
+ total_up += total_up_off[idx]
+ total_down += total_down_off[idx]
+
+ total_up /= len(total_up_off)
+ total_down /= len(total_down_off)
+
+ print 'Mean up_off is %f' % total_up
+ print 'Mean down_off is %f' % total_down
+ print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
+ print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
+
+ print 'Prediction completed'
+ self.logfh.close()
+
+def fetch_genome_info(ginfo_filename):
+ if not os.path.exists(ginfo_filename):
+ cmd = ['']*4
+ cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
+ cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
+ cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
+ cmd[3] = 'save genome_info.mat genome_info'
+ full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
+
+ obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+ out,err = obj.communicate()
+ assert err == '', 'An error occured!\n%s'%err
+
+ ginfo = scipy.io.loadmat('genome_info.mat')
+ cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
+ return ginfo['genome_info']
+
+ else:
+ return cPickle.load(open(ginfo_filename))
+
+def plifs2param(h,d,a,mmatrix,qualityPlifs):
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+
+
+ param = zeros((Conf.numFeatures,1))
+ param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
+
+ for idx in range(len(qualityPlifs)):
+ currentPlif = qualityPlifs[idx]
+ begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+ end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+ param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
+
+ return param
+
+def load_param(filename):
+ param = None
+ #try:
+ # para = cPickle.load(open(filename))
+ #except:
+ # print 'Error: Could not open parameter file!'
+ # sys.exit(1)
+ #
+ #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
+
+ param = cPickle.load(open(filename))
+ return param
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
+ newExons = []
+ oldElem = -1
+ SpliceAlign = SpliceAlign.flatten().tolist()[0]
+ SpliceAlign.append(-1)
+ for pos,elem in enumerate(SpliceAlign):
+ if pos == 0:
+ oldElem = -1
+ else:
+ oldElem = SpliceAlign[pos-1]
+
+ if oldElem != 0 and elem == 0: # start of exon
+ newExons.append(pos-1)
+
+ if oldElem == 0 and elem != 0: # end of exon
+ newExons.append(pos)
+
+ up_off = -1
+ down_off = -1
+
+ if len(newExons) != 4:
+ acc = 0.0
+ else:
+ e1_begin,e1_end = newExons[0],newExons[1]
+ e2_begin,e2_end = newExons[2],newExons[3]
+
+ up_off = int(math.fabs(e1_end - exons[0,1]))
+ down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+ #pdb.set_trace()
+
+ return up_off,down_off
+
+if __name__ == '__main__':
+
+ qpalma = QPalma()
+ if len(sys.argv) != 2:
+ print 'You have to choose between training or prediction mode:'
+ print 'python qpalma. py (train|predict)'
+
+ mode = sys.argv[1]
+ assert mode in ['train','predict']
+
+ if mode == 'train':
+ qpalma.train()
+
+ if mode == 'predict':
+ qpalma.predict()
+
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-#
-#
-###########################################################
-
-import sys
-import subprocess
-import scipy.io
-import pdb
-import os.path
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-import QPalmaDP
-
-import qpalma
-from qpalma.SIQP_CPX import SIQPSolver
-from qpalma.DataProc import *
-
-from qpalma.generateEvaluationData import *
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.TrainingParam import Param
-from qpalma.export_param import *
-
-import qpalma.Configuration
-from qpalma.Plif import Plf
-from qpalma.Helpers import *
-
-from qpalma.tools.splicesites import getDonAccScores
-
-def getQualityFeatureCounts(qualityPlifs):
- weightQuality = qualityPlifs[0].penalties
- for currentPlif in qualityPlifs[1:]:
- weightQuality = numpy.vstack([weightQuality, currentPlif.penalties])
-
- return weightQuality
-
-class QPalma:
- """
- A training method for the QPalma project
- """
-
- def __init__(self):
- self.ARGS = Param()
- self.logfh = open('qpalma.log','w+')
-
- #gen_file= '%s/genome.config' % self.ARGS.basedir
- #ginfo_filename = 'genome_info.pickle'
- #self.genome_info = fetch_genome_info(ginfo_filename)
- #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
- #self.ARGS.train_with_splicesitescoreinformation = False
-
- def plog(self,string):
- self.logfh.write(string)
- self.logfh.flush()
-
- def predict(self):
- # Load the whole dataset
- if Configuration.mode == 'normal':
- #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
- use_quality_scores = False
-
- elif Confguration.mode == 'using_quality_scores':
- Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos =\
- paths_load_data_solexa('training',None,self.ARGS,True)
-
- begin = 0
- end = 1000
- Sequences = Sequences[begin:end]
- Exons = Exons[begin:end]
- Ests = Ests[begin:end]
- Qualities = Qualities[begin:end]
- Acceptors = Acceptors[begin:end]
- Donors = Donors[begin:end]
-
- pdb.set_trace()
-
- Donors, Acceptors = getDonAccScores(Sequences)
-
-
- #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
- #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
- #Qualities = []
- #for i in range(len(Ests)):
- # Qualities.append([40]*len(Ests[i]))
- use_quality_scores = True
- else:
- assert(False)
-
- # number of training instances
- N = len(Sequences)
- self.numExamples = N
- assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
- and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
- self.plog('Number of training examples: %d\n'% N)
- print 'Number of features: %d\n'% Conf.numFeatures
-
- remove_duplicate_scores = Conf.remove_duplicate_scores
- print_matrix = Conf.print_matrix
- anzpath = Conf.anzpath
-
- # Initialize parameter vector / param = numpy.matlib.rand(126,1)
- #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
- param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
- param = load_param(param_filename)
-
- # Set the parameters such as limits penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- # stores the number of alignments done for each example (best path, second-best path etc.)
- num_path = [anzpath]*N
-
- #############################################################################################
- # Training
- #############################################################################################
- self.plog('Starting training...\n')
-
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
- numq = Conf.numQualSuppPoints
- totalQualSP = Conf.totalQualSuppPoints
-
- currentPhi = zeros((Conf.numFeatures,1))
- totalQualityPenalties = zeros((totalQualSP,1))
-
- total_up_off = []
- total_down_off = []
-
- #for exampleIdx in range(self.numExamples):
- for exampleIdx in range(200):
-
- dna = Sequences[exampleIdx]
- est = Ests[exampleIdx]
- currentSplitPos = SplitPos[exampleIdx]
-
- if Conf.mode == 'normal':
- quality = [40]*len(est)
-
- if Conf.mode == 'using_quality_scores':
- quality = Qualities[exampleIdx]
-
- exons = Exons[exampleIdx]
- # NoiseMatrix = Noises[exampleIdx]
- don_supp = Donors[exampleIdx]
- acc_supp = Acceptors[exampleIdx]
-
- if exons[-1,1] > len(dna):
- continue
-
- # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
- trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-
- # Calculate the weights
- trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
- trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
- currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- if Conf.mode == 'using_quality_scores':
- totalQualityPenalties = param[-totalQualSP:]
- currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
-
- # Calculate w'phi(x,y) the total score of the alignment
- trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
- # The allWeights vector is supposed to store the weight parameter
- # of the true alignment as well as the weight parameters of the
- # 1 other alignments
- allWeights = zeros((Conf.numFeatures,1+1))
- allWeights[:,0] = trueWeight[:,0]
-
- AlignmentScores = [0.0]*(1+1)
- AlignmentScores[0] = trueAlignmentScore
-
- ################## Calculate wrong alignment(s) ######################
-
- # Compute donor, acceptor with penalty_lookup_new
- # returns two double lists
- donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
- #myalign wants the acceptor site on the g of the ag
- acceptor = acceptor[1:]
- acceptor.append(-inf)
-
- # for now we don't use donor/acceptor scores
- #donor = [-inf] * len(donor)
- #acceptor = [-inf] * len(donor)
-
- dna = str(dna)
- est = str(est)
- dna_len = len(dna)
- est_len = len(est)
-
- ps = h.convert2SWIG()
-
- prb = QPalmaDP.createDoubleArrayFromList(quality)
- chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
- matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
- mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
- d_len = len(donor)
- donor = QPalmaDP.createDoubleArrayFromList(donor)
- a_len = len(acceptor)
- acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
- # Create the alignment object representing the interface to the C/C++ code.
- currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, use_quality_scores)
-
- c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
- # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
- currentAlignment.myalign(1, dna, dna_len,\
- est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
- print_matrix)
-
- c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
- c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
- c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
- c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
-
- c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
-
- currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
- c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
- newSpliceAlign = zeros((dna_len,1))
- newEstAlign = zeros((est_len,1))
- newWeightMatch = zeros((mm_len,1))
- newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
-
- #print 'newSpliceAlign'
- for i in range(dna_len):
- newSpliceAlign[i] = c_SpliceAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'newEstAlign'
- for i in range(est_len):
- newEstAlign[i] = c_EstAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'weightMatch'
- for i in range(mm_len):
- newWeightMatch[i] = c_WeightMatch[i]
- # print '%f' % (weightMatch[i])
-
- newDPScores = c_DPScores[0]
-
- for i in range(Conf.totalQualSuppPoints):
- newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
- # equals palma up to here
-
- #print "Calling destructors"
- del c_SpliceAlign
- del c_EstAlign
- del c_WeightMatch
- del c_DPScores
- del c_qualityPlifsFeatures
- del currentAlignment
-
- newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
- newWeightMatch = newWeightMatch.reshape(1,mm_len)
- # Calculate weights of the respective alignments Note that we are
- # calculating n-best alignments without hamming loss, so we
- # have to keep track which of the n-best alignments correspond to
- # the true one in order not to incorporate a true alignment in the
- # constraints. To keep track of the true and false alignments we
- # define an array true_map with a boolean indicating the
- # equivalence to the true alignment for each decoded alignment.
- true_map = [0]*2
- true_map[0] = 1
- pathNr = 0
-
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
-
- decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
- for qidx in range(Conf.totalQualSuppPoints):
- decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
-
- # Gewichte in restliche Zeilen der Matrix speichern
- wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
- allWeights[:,pathNr+1] = wp
-
- hpen = mat(h.penalties).reshape(len(h.penalties),1)
- dpen = mat(d.penalties).reshape(len(d.penalties),1)
- apen = mat(a.penalties).reshape(len(a.penalties),1)
- features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-
- AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
- # Check wether scalar product + loss equals viterbi score
- print '%f vs. %f' % (newDPScores, AlignmentScores[pathNr+1])
-
- # if the pathNr-best alignment is very close to the true alignment consider it as true
- if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
- true_map[pathNr+1] = 1
-
- #pdb.set_trace()
-
- up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
- #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
- #print up_off,down_off
-
- if up_off > -1:
- total_up_off.append(up_off)
- total_down_off.append(down_off)
-
- total_up = 0
- total_down = 0
- for idx in range(len(total_up_off)):
- total_up += total_up_off[idx]
- total_down += total_down_off[idx]
-
- total_up /= len(total_up_off)
- total_down /= len(total_down_off)
-
- print 'Mean up_off is %f' % total_up
- print 'Mean down_off is %f' % total_down
- print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
- print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
- #print total_up_off
- #print total_down_off
-
- print 'Prediction completed'
- self.logfh.close()
-
-def fetch_genome_info(ginfo_filename):
- if not os.path.exists(ginfo_filename):
- cmd = ['']*4
- cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
- cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
- cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
- cmd[3] = 'save genome_info.mat genome_info'
- full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
- obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- out,err = obj.communicate()
- assert err == '', 'An error occured!\n%s'%err
-
- ginfo = scipy.io.loadmat('genome_info.mat')
- cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
- return ginfo['genome_info']
-
- else:
- return cPickle.load(open(ginfo_filename))
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
-
-
- param = zeros((Conf.numFeatures,1))
- param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- for idx in range(len(qualityPlifs)):
- currentPlif = qualityPlifs[idx]
- begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
- end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
- param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
- return param
-
-def load_param(filename):
- param = None
- #try:
- # para = cPickle.load(open(filename))
- #except:
- # print 'Error: Could not open parameter file!'
- # sys.exit(1)
- #
- #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
- param = cPickle.load(open(filename))
- return param
-
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
- newExons = []
- oldElem = -1
- SpliceAlign = SpliceAlign.flatten().tolist()[0]
- SpliceAlign.append(-1)
- for pos,elem in enumerate(SpliceAlign):
- if pos == 0:
- oldElem = -1
- else:
- oldElem = SpliceAlign[pos-1]
-
- if oldElem != 0 and elem == 0: # start of exon
- newExons.append(pos-1)
-
- if oldElem == 0 and elem != 0: # end of exon
- newExons.append(pos)
-
- up_off = -1
- down_off = -1
-
- if len(newExons) != 4:
- acc = 0.0
- else:
- e1_begin,e1_end = newExons[0],newExons[1]
- e2_begin,e2_end = newExons[2],newExons[3]
-
- up_off = int(math.fabs(e1_end - exons[0,1]))
- down_off = int(math.fabs(e2_begin - exons[1,0]))
-
- #pdb.set_trace()
-
- return up_off,down_off
-
-if __name__ == '__main__':
- qpalma = QPalma()
- qpalma.run()
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-# The QPalma project aims at extending the Palma project
-# to be able to use Solexa reads toegether with their
-# quality scores.
-#
-# This file represents the conversion of the main matlab
-# training loop for Palma to python.
-#
-# Author: Fabio De Bona
-#
-###########################################################
-
-import sys
-import subprocess
-import scipy.io
-import pdb
-import os.path
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-import QPalmaDP
-
-import qpalma
-import qpalma.Configuration
-
-from qpalma.SIQP_CPX import SIQPSolver
-from qpalma.DataProc import *
-from qpalma.generateEvaluationData import *
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.penalty_lookup_new import *
-from qpalma.compute_donacc import *
-from qpalma.export_param import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf
-
-from qpalma.tools.splicesites import getDonAccScores
-
-class para:
- pass
-
-class QPalma:
- """
- A training method for the QPalma project
- """
-
- def __init__(self):
- self.ARGS = Param()
- self.logfh = open('qpalma.log','w+')
-
- #gen_file= '%s/genome.config' % self.ARGS.basedir
- #ginfo_filename = 'genome_info.pickle'
- #self.genome_info = fetch_genome_info(ginfo_filename)
- #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
-
- #self.ARGS.train_with_splicesitescoreinformation = False
-
- # Load the whole dataset
- if Configuration.mode == 'normal':
- #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
- self.use_quality_scores = False
-
- elif Configuration.mode == 'using_quality_scores':
-
- filename = 'real_dset_%s'% 'recent'
- if True:# not os.path.exists(filename):
- Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
-
- end = 1000
- self.Sequences = Sequences[:end]
- self.Exons = Exons[:end]
- self.Ests = Ests[:end]
- self.Qualities = Qualities[:end]
- self.Acceptors = Acceptors[:end]
- self.Donors = Donors[:end]
-
- self.Donors, self.Acceptors = getDonAccScores(Sequences)
- #dset = Dataset()
- else:
- dset = cPickle.load(open(filename))
- Sequences = dset.Sequences
- Acceptors = dset.Acceptors
- Donors = dset.Donors
- Exons = dset.Exons
- Ests = dset.Ests
- Qualities = dset.Qualities
-
- #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
- #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
- #Qualities = []
- #for i in range(len(Ests)):
- # Qualities.append([40]*len(Ests[i]))
- self.use_quality_scores = True
- else:
- assert(False)
-
- # number of training instances
- N = len(Sequences)
- self.numExamples = N
- assert N == len(Acceptors) and N == len(Acceptors) and N == len(Exons)\
- and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
- self.plog('Number of training examples: %d\n'% N)
- print 'Number of features: %d\n'% Configuration.numFeatures
-
- def plog(self,string):
- self.logfh.write(string)
- self.logfh.flush()
-
- def train(self):
- beg = Configuration.training_begin
- end = Configuration.training_end
-
- Sequences = self.Sequences[beg:end]
- Exons = self.Exons[beg:end]
- Ests = self.Ests[beg:end]
- Qualities = self.Qualities[beg:end]
- Acceptors = self.Acceptors[beg:end]
- Donors = self.Donors[beg:end]
-
- iteration_steps = Configuration.iter_steps ; #upper bound on iteration steps
- remove_duplicate_scores = Configuration.remove_duplicate_scores
- print_matrix = Configuration.print_matrix
- anzpath = Configuration.anzpath
-
- # Initialize parameter vector / param = numpy.matlib.rand(126,1)
- param = Configuration.fixedParam
-
- # Set the parameters such as limits penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- # delete splicesite-score-information
- #if not self.ARGS.train_with_splicesitescoreinformation:
- # for i in range(len(Acceptors)):
- # if Acceptors[i] > -20:
- # Acceptors[i] = 1
- # if Donors[i] >-20:
- # Donors[i] = 1
-
- # Initialize solver
- if Configuration.USE_OPT:
- self.plog('Initializing problem...\n')
- solver = SIQPSolver(Configuration.numFeatures,self.numExamples,Configuration.C,self.logfh)
-
- # stores the number of alignments done for each example (best path, second-best path etc.)
- num_path = [anzpath]*self.numExamples
- # stores the gap for each example
- gap = [0.0]*self.numExamples
-
- #############################################################################################
- # Training
- #############################################################################################
- self.plog('Starting training...\n')
-
- donSP = Configuration.numDonSuppPoints
- accSP = Configuration.numAccSuppPoints
- lengthSP = Configuration.numLengthSuppPoints
- mmatrixSP = Configuration.sizeMatchmatrix[0]\
- *Configuration.sizeMatchmatrix[1]
- numq = Configuration.numQualSuppPoints
- totalQualSP = Configuration.totalQualSuppPoints
-
- currentPhi = zeros((Configuration.numFeatures,1))
- totalQualityPenalties = zeros((totalQualSP,1))
-
- iteration_nr = 0
- param_idx = 0
- const_added_ctr = 0
- while True:
- if iteration_nr == iteration_steps:
- break
-
- for exampleIdx in range(self.numExamples):
- if (exampleIdx%100) == 0:
- print 'Current example nr %d' % exampleIdx
-
- dna = Sequences[exampleIdx]
- est = Ests[exampleIdx]
-
- if Configuration.mode == 'normal':
- quality = [40]*len(est)
-
- if Configuration.mode == 'using_quality_scores':
- quality = Qualities[exampleIdx]
-
- exons = Exons[exampleIdx]
- # NoiseMatrix = Noises[exampleIdx]
- don_supp = Donors[exampleIdx]
- acc_supp = Acceptors[exampleIdx]
-
- # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
- trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-
- #print 'trueWeights'
- # Calculate the weights
- trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
- trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
- currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- #pdb.set_trace()
- if Configuration.mode == 'using_quality_scores':
- totalQualityPenalties = param[-totalQualSP:]
- currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
-
- # Calculate w'phi(x,y) the total score of the alignment
- trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
- # The allWeights vector is supposed to store the weight parameter
- # of the true alignment as well as the weight parameters of the
- # num_path[exampleIdx] other alignments
- allWeights = zeros((Configuration.numFeatures,num_path[exampleIdx]+1))
- allWeights[:,0] = trueWeight[:,0]
-
- AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
- AlignmentScores[0] = trueAlignmentScore
-
- ################## Calculate wrong alignment(s) ######################
-
- # Compute donor, acceptor with penalty_lookup_new
- # returns two double lists
- donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
- #myalign wants the acceptor site on the g of the ag
- acceptor = acceptor[1:]
- acceptor.append(-inf)
-
- # for now we don't use donor/acceptor scores
-
- #donor = [-inf] * len(donor)
- #acceptor = [-inf] * len(donor)
-
- dna = str(dna)
- est = str(est)
- dna_len = len(dna)
- est_len = len(est)
-
- ps = h.convert2SWIG()
-
- prb = QPalmaDP.createDoubleArrayFromList(quality)
- chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
- matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
- mm_len = Configuration.sizeMatchmatrix[0]*Configuration.sizeMatchmatrix[1]
-
- d_len = len(donor)
- donor = QPalmaDP.createDoubleArrayFromList(donor)
- a_len = len(acceptor)
- acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
- # Create the alignment object representing the interface to the C/C++ code.
- currentAlignment = QPalmaDP.Alignment(Configuration.numQualPlifs,Configuration.numQualSuppPoints, self.use_quality_scores)
-
- c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
- #print 'Calling myalign...'
- # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
- currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
- est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
- print_matrix)
-
- #print 'After calling myalign...'
- #print 'Calling getAlignmentResults...'
-
- c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
- c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*num_path[exampleIdx]))
- c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*num_path[exampleIdx]))
- c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
-
- c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Configuration.totalQualSuppPoints*num_path[exampleIdx]))
-
- currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
- c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
- #print 'After calling getAlignmentResults...'
-
- newSpliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
- newEstAlign = zeros((est_len*num_path[exampleIdx],1))
- newWeightMatch = zeros((num_path[exampleIdx]*mm_len,1))
- newDPScores = zeros((num_path[exampleIdx],1))
- newQualityPlifsFeatures = zeros((Configuration.totalQualSuppPoints*num_path[exampleIdx],1))
-
- #print 'newSpliceAlign'
- for i in range(dna_len*num_path[exampleIdx]):
- newSpliceAlign[i] = c_SpliceAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'newEstAlign'
- for i in range(est_len*num_path[exampleIdx]):
- newEstAlign[i] = c_EstAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'weightMatch'
- for i in range(mm_len*num_path[exampleIdx]):
- newWeightMatch[i] = c_WeightMatch[i]
- # print '%f' % (weightMatch[i])
-
- #print 'ViterbiScores'
- for i in range(num_path[exampleIdx]):
- newDPScores[i] = c_DPScores[i]
-
-
- if self.use_quality_scores:
- for i in range(Configuration.totalQualSuppPoints*num_path[exampleIdx]):
- newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
- # equals palma up to here
-
- #print "Calling destructors"
- del c_SpliceAlign
- del c_EstAlign
- del c_WeightMatch
- del c_DPScores
- del c_qualityPlifsFeatures
- del currentAlignment
-
- newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
- newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
- # Calculate weights of the respective alignments Note that we are
- # calculating n-best alignments without hamming loss, so we
- # have to keep track which of the n-best alignments correspond to
- # the true one in order not to incorporate a true alignment in the
- # constraints. To keep track of the true and false alignments we
- # define an array true_map with a boolean indicating the
- # equivalence to the true alignment for each decoded alignment.
- true_map = [0]*(num_path[exampleIdx]+1)
- true_map[0] = 1
- path_loss = [0]*(num_path[exampleIdx])
-
- for pathNr in range(num_path[exampleIdx]):
- #print 'decodedWeights'
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
- h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
- acc_supp)
-
- decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
- for qidx in range(Configuration.totalQualSuppPoints):
- decodedQualityFeatures[qidx] = newQualityPlifsFeatures[(pathNr*Configuration.totalQualSuppPoints)+qidx]
-
- #pdb.set_trace()
-
- path_loss[pathNr] = 0
- # sum up positionwise loss between alignments
- for alignPosIdx in range(newSpliceAlign[pathNr,:].shape[1]):
- if newSpliceAlign[pathNr,alignPosIdx] != trueSpliceAlign[alignPosIdx]:
- path_loss[pathNr] += 1
-
- #pdb.set_trace()
-
- # Gewichte in restliche Zeilen der Matrix speichern
- wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
- allWeights[:,pathNr+1] = wp
-
- hpen = mat(h.penalties).reshape(len(h.penalties),1)
- dpen = mat(d.penalties).reshape(len(d.penalties),1)
- apen = mat(a.penalties).reshape(len(a.penalties),1)
- features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
-
- AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
- # Check wether scalar product + loss equals viterbi score
- #print 'Example nr.: %d, path nr. %d, scores: %f vs %f' % (exampleIdx,pathNr,newDPScores[pathNr,0], AlignmentScores[pathNr+1])
-
- distinct_scores = False
- if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
- distinct_scores = True
-
- #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
- if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
- pdb.set_trace()
-
- # # if the pathNr-best alignment is very close to the true alignment consider it as true
- if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
- true_map[pathNr+1] = 1
-
- # assert AlignmentScores[0] > max(AlignmentScores[1:]) + 1e-6, pdb.set_trace()
-
- # the true label sequence should not have a larger score than the maximal one WHYYYYY?
- # this means that all n-best paths are to close to each other
- # we have to extend the n-best search to a (n+1)-best
- if len([elem for elem in true_map if elem == 1]) == len(true_map):
- num_path[exampleIdx] = num_path[exampleIdx]+1
-
- # Choose true and first false alignment for extending A
- firstFalseIdx = -1
- for map_idx,elem in enumerate(true_map):
- if elem == 0:
- firstFalseIdx = map_idx
- break
-
- # if there is at least one useful false alignment add the
- # corresponding constraints to the optimization problem
- if firstFalseIdx != -1:
- trueWeights = allWeights[:,0]
- firstFalseWeights = allWeights[:,firstFalseIdx]
- differenceVector = trueWeights - firstFalseWeights
- #pdb.set_trace()
-
- if Configuration.USE_OPT:
- const_added = solver.addConstraint(differenceVector, exampleIdx)
- const_added_ctr += 1
- #
- # end of one example processing
- #
-
- # call solver every nth example //added constraint
- if exampleIdx != 0 and exampleIdx % 100 == 0 and Configuration.USE_OPT:
- objValue,w,self.slacks = solver.solve()
-
- print "objValue is %f" % objValue
-
- sum_xis = 0
- for elem in self.slacks:
- sum_xis += elem
-
- for i in range(len(param)):
- param[i] = w[i]
-
- #pdb.set_trace()
- cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
- param_idx += 1
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- #
- # end of one iteration through all examples
- #
- iteration_nr += 1
-
- #
- # end of optimization
- #
- print 'Training completed'
-
- pa = para()
- pa.h = h
- pa.d = d
- pa.a = a
- pa.mmatrix = mmatrix
- pa.qualityPlifs = qualityPlifs
-
- cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
- #cPickle.dump(pa,open('elegans.param','w+'))
- self.logfh.close()
-
-
-
- def predict(self):
- beg = Configuration.prediction_begin
- end = Configuration.prediction_end
-
- Sequences = self.Sequences[beg:end]
- Exons = self.Exons[beg:end]
- Ests = self.Ests[beg:end]
- Qualities = self.Qualities[beg:end]
- Acceptors = self.Acceptors[beg:end]
- Donors = self.Donors[beg:end]
-
- remove_duplicate_scores = Conf.remove_duplicate_scores
- print_matrix = Conf.print_matrix
- anzpath = Conf.anzpath
-
- param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_28.pickle'
- param = load_param(param_filename)
-
- # Set the parameters such as limits penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- #############################################################################################
- # Prediction
- #############################################################################################
- self.plog('Starting prediction...\n')
-
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
- numq = Conf.numQualSuppPoints
- totalQualSP = Conf.totalQualSuppPoints
-
- currentPhi = zeros((Conf.numFeatures,1))
- totalQualityPenalties = zeros((totalQualSP,1))
-
- total_up_off = []
- total_down_off = []
-
- for exampleIdx in range(self.numExamples):
- dna = Sequences[exampleIdx]
- est = Ests[exampleIdx]
- currentSplitPos = SplitPos[exampleIdx]
-
- if Conf.mode == 'normal':
- quality = [40]*len(est)
-
- if Conf.mode == 'using_quality_scores':
- quality = Qualities[exampleIdx]
-
- exons = Exons[exampleIdx]
- # NoiseMatrix = Noises[exampleIdx]
- don_supp = Donors[exampleIdx]
- acc_supp = Acceptors[exampleIdx]
-
- if exons[-1,1] > len(dna):
- continue
-
- # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
- trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-
- # Calculate the weights
- trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
- trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
- currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- if Conf.mode == 'using_quality_scores':
- totalQualityPenalties = param[-totalQualSP:]
- currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
-
- # Calculate w'phi(x,y) the total score of the alignment
- trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
- # The allWeights vector is supposed to store the weight parameter
- # of the true alignment as well as the weight parameters of the
- # 1 other alignments
- allWeights = zeros((Conf.numFeatures,1+1))
- allWeights[:,0] = trueWeight[:,0]
-
- AlignmentScores = [0.0]*(1+1)
- AlignmentScores[0] = trueAlignmentScore
-
- ################## Calculate wrong alignment(s) ######################
-
- # Compute donor, acceptor with penalty_lookup_new
- # returns two double lists
- donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
- #myalign wants the acceptor site on the g of the ag
- acceptor = acceptor[1:]
- acceptor.append(-inf)
-
- dna = str(dna)
- est = str(est)
- dna_len = len(dna)
- est_len = len(est)
-
- ps = h.convert2SWIG()
-
- prb = QPalmaDP.createDoubleArrayFromList(quality)
- chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
- matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
- mm_len = Conf.sizeMatchmatrix[0]*Conf.sizeMatchmatrix[1]
-
- d_len = len(donor)
- donor = QPalmaDP.createDoubleArrayFromList(donor)
- a_len = len(acceptor)
- acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
- # Create the alignment object representing the interface to the C/C++ code.
- currentAlignment = QPalmaDP.Alignment(Conf.numQualPlifs,Conf.numQualSuppPoints, self.use_quality_scores)
-
- c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-
- # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
- currentAlignment.myalign(1, dna, dna_len,\
- est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, c_qualityPlifs, remove_duplicate_scores,
- print_matrix)
-
- c_SpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*1))
- c_EstAlign = QPalmaDP.createIntArrayFromList([0]*(est_len*1))
- c_WeightMatch = QPalmaDP.createIntArrayFromList([0]*(mm_len*1))
- c_DPScores = QPalmaDP.createDoubleArrayFromList([.0]*1)
-
- c_qualityPlifsFeatures = QPalmaDP.createDoubleArrayFromList([.0]*(Conf.totalQualSuppPoints))
-
- currentAlignment.getAlignmentResults(c_SpliceAlign, c_EstAlign,\
- c_WeightMatch, c_DPScores, c_qualityPlifsFeatures)
-
- newSpliceAlign = zeros((dna_len,1))
- newEstAlign = zeros((est_len,1))
- newWeightMatch = zeros((mm_len,1))
- newQualityPlifsFeatures = zeros((Conf.totalQualSuppPoints,1))
-
- #print 'newSpliceAlign'
- for i in range(dna_len):
- newSpliceAlign[i] = c_SpliceAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'newEstAlign'
- for i in range(est_len):
- newEstAlign[i] = c_EstAlign[i]
- # print '%f' % (spliceAlign[i])
-
- #print 'weightMatch'
- for i in range(mm_len):
- newWeightMatch[i] = c_WeightMatch[i]
- # print '%f' % (weightMatch[i])
-
- newDPScores = c_DPScores[0]
-
- for i in range(Conf.totalQualSuppPoints):
- newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
-
- #print "Calling destructors"
- del c_SpliceAlign
- del c_EstAlign
- del c_WeightMatch
- del c_DPScores
- del c_qualityPlifsFeatures
- del currentAlignment
-
- newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
- newWeightMatch = newWeightMatch.reshape(1,mm_len)
- # Calculate weights of the respective alignments Note that we are
- # calculating n-best alignments without hamming loss, so we
- # have to keep track which of the n-best alignments correspond to
- # the true one in order not to incorporate a true alignment in the
- # constraints. To keep track of the true and false alignments we
- # define an array true_map with a boolean indicating the
- # equivalence to the true alignment for each decoded alignment.
- true_map = [0]*2
- true_map[0] = 1
- pathNr = 0
-
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a, h, newSpliceAlign.flatten().tolist()[0], don_supp, acc_supp)
-
- decodedQualityFeatures = zeros((Conf.totalQualSuppPoints,1))
- for qidx in range(Conf.totalQualSuppPoints):
- decodedQualityFeatures[qidx] = newQualityPlifsFeatures[qidx]
-
- # Gewichte in restliche Zeilen der Matrix speichern
- wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch.T, decodedQualityFeatures])
- allWeights[:,pathNr+1] = wp
-
- hpen = mat(h.penalties).reshape(len(h.penalties),1)
- dpen = mat(d.penalties).reshape(len(d.penalties),1)
- apen = mat(a.penalties).reshape(len(a.penalties),1)
- features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties])
- AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
- # if the pathNr-best alignment is very close to the true alignment consider it as true
- if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
- true_map[pathNr+1] = 1
-
- up_off,down_off = evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
- #evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos)
- #print up_off,down_off
-
- if up_off > -1:
- total_up_off.append(up_off)
- total_down_off.append(down_off)
-
- total_up = 0
- total_down = 0
- for idx in range(len(total_up_off)):
- total_up += total_up_off[idx]
- total_down += total_down_off[idx]
-
- total_up /= len(total_up_off)
- total_down /= len(total_down_off)
-
- print 'Mean up_off is %f' % total_up
- print 'Mean down_off is %f' % total_down
- print 'Correct up_off len is %d' % len([elem for elem in total_up_off if elem in [0,1]])
- print 'Correct down_off len is %d' % len([elem for elem in total_down_off if elem in [0,1]])
-
- print 'Prediction completed'
- self.logfh.close()
-
-def fetch_genome_info(ginfo_filename):
- if not os.path.exists(ginfo_filename):
- cmd = ['']*4
- cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
- cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
- cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
- cmd[3] = 'save genome_info.mat genome_info'
- full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
- obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- out,err = obj.communicate()
- assert err == '', 'An error occured!\n%s'%err
-
- ginfo = scipy.io.loadmat('genome_info.mat')
- cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
- return ginfo['genome_info']
-
- else:
- return cPickle.load(open(ginfo_filename))
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
- donSP = Conf.numDonSuppPoints
- accSP = Conf.numAccSuppPoints
- lengthSP = Conf.numLengthSuppPoints
- mmatrixSP = Conf.sizeMatchmatrix[0]\
- *Conf.sizeMatchmatrix[1]
-
-
- param = zeros((Conf.numFeatures,1))
- param[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
- param[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- param[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
-
- for idx in range(len(qualityPlifs)):
- currentPlif = qualityPlifs[idx]
- begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
- end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
- param[begin:end] = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
- return param
-
-def load_param(filename):
- param = None
- #try:
- # para = cPickle.load(open(filename))
- #except:
- # print 'Error: Could not open parameter file!'
- # sys.exit(1)
- #
- #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
- param = cPickle.load(open(filename))
- return param
-
-def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
- newExons = []
- oldElem = -1
- SpliceAlign = SpliceAlign.flatten().tolist()[0]
- SpliceAlign.append(-1)
- for pos,elem in enumerate(SpliceAlign):
- if pos == 0:
- oldElem = -1
- else:
- oldElem = SpliceAlign[pos-1]
-
- if oldElem != 0 and elem == 0: # start of exon
- newExons.append(pos-1)
-
- if oldElem == 0 and elem != 0: # end of exon
- newExons.append(pos)
-
- up_off = -1
- down_off = -1
-
- if len(newExons) != 4:
- acc = 0.0
- else:
- e1_begin,e1_end = newExons[0],newExons[1]
- e2_begin,e2_end = newExons[2],newExons[3]
-
- up_off = int(math.fabs(e1_end - exons[0,1]))
- down_off = int(math.fabs(e2_begin - exons[1,0]))
-
- #pdb.set_trace()
-
- return up_off,down_off
-
-if __name__ == '__main__':
-
- qpalma = QPalma()
- if len(sys.argv) != 2:
- print 'You have to choose between training or prediction mode:'
- print 'python qpalma. py (train|predict)'
-
- mode = sys.argv[1]
- assert mode in ['train','predict']
-
- if mode == 'train':
- qpalma.train()
-
- if mode == 'predict':
- qpalma.predict()
-