alignment step. Optimal alignment parameters, i.e.\ scoring matrices, are
inferred using a machine learning technique similar to \emph{Support Vector
Machines}. For further details on \QP itself consult \cite{DeBona08}. For
-details about the learning method \cite{Tsochantaridis04}.
-%$SOLid$.
+details about the learning method see \cite{Tsochantaridis04}.
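+Schematically, training solves a max-margin problem of roughly the following
+form (a sketch; see \cite{Tsochantaridis04} for the exact constraint and
+slack formulation):
+\[
+\min_{\mathbf{w},\,\xi_i\ge 0}\;\frac{1}{2}\|\mathbf{w}\|^2 + C\sum_i \xi_i
+\quad\text{s.t.}\quad
+\mathbf{w}^\top\Phi(x_i,y_i) \ge \mathbf{w}^\top\Phi(x_i,y) + \Delta(y_i,y) - \xi_i
+\quad\forall\, y\ne y_i,
+\]
+where $\Phi(x,y)$ denotes the feature representation of an alignment $y$ of
+read $x$ and $\Delta$ is a loss term.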
%We refer to the whole pipeline as the \QP pipeline and \QP respectively.
\section{Quick Tour}
- pythongrid
- Genefinding doIntervalQuery
-
\begin{thebibliography}{2}
\bibitem[1]{DeBona08}
F.~De Bona, S.~Ossowski, K.~Schneeberger, and G.~R\"atsch.
\newblock Optimal spliced alignments of short sequence reads.
\newblock \emph{Bioinformatics}, 24(16):i174--i180, 2008.
\bibitem[2]{Tsochantaridis04}
I.~Tsochantaridis, T.~Hofmann, T.~Joachims, and Y.~Altun.
\newblock Support vector machine learning for interdependent and structured
output spaces.
\newblock In \emph{Proceedings of the 21st International Conference on
Machine Learning (ICML)}, 2004.
\end{thebibliography}
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Gunnar Rätsch, Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+import cPickle
+import sys
+import pdb
+import os
+import os.path
+import resource
+
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy import inf,mean
+
+from ParaParser import ParaParser,IN_VECTOR
+
+from qpalma.Lookup import LookupTable
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
+
+
+def cpu():
+ return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
+ resource.getrusage(resource.RUSAGE_SELF).ru_stime)
+
+
+class BracketWrapper:
+ fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
+
+ def __init__(self,filename):
+ self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+ self.parser.parseFile(filename)
+
+ def __len__(self):
+ return self.parser.getSize(IN_VECTOR)
+
+ def __getitem__(self,key):
+ return self.parser.fetchEntry(key)
+
+ def __iter__(self):
+ self.counter = 0
+ self.size = self.parser.getSize(IN_VECTOR)
+ return self
+
+ def next(self):
+ if not self.counter < self.size:
+ raise StopIteration
+ return_val = self.parser.fetchEntry(self.counter)
+ self.counter += 1
+ return return_val
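+
+# A minimal usage sketch for BracketWrapper (the file name is a hypothetical
+# placeholder); each entry yields the parsed fields and the original line,
+# as used in PipelineHeuristic.filter below:
+#
+#   reads = BracketWrapper('remapped_reads.txt')
+#   for location,original_line in reads:
+#      print location['id'], location['chr'], location['pos']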
+
+
+class PipelineHeuristic:
+ """
+   This class wraps the filter which decides whether an alignment found by
+   vmatch is spliced and should then be realigned using QPalma or not.
+ """
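+   # Intended usage (a sketch; the file names and the settings dictionary
+   # are hypothetical placeholders):
+   #
+   #   ph = PipelineHeuristic('run_obj.pickle','reads.txt','param.pickle',\
+   #                          'result',settings)
+   #   ph.filter()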
+
+ def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
+ """
+      We need a run object holding information about the number of support
+      points etc.
+ """
+
+ run = cPickle.load(open(run_fname))
+ self.run = run
+
+ start = cpu()
+ self.all_remapped_reads = BracketWrapper(data_fname)
+ stop = cpu()
+
+ print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
+
+ self.chromo_range = settings['allowed_fragments']
+
+      accessWrapper = DataAccessWrapper(settings)
+      self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+ start = cpu()
+ self.lt1 = LookupTable(settings)
+ stop = cpu()
+
+ print 'prefetched sequence and splice data in %f sec' % (stop-start)
+
+ self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+ self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
+
+ start = cpu()
+
+ self.data_fname = data_fname
+
+ self.param = cPickle.load(open(param_fname))
+
+      # Set the parameters such as limits/penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
+
+ self.h = h
+ self.d = d
+ self.a = a
+ self.mmatrix = mmatrix
+ self.qualityPlifs = qualityPlifs
+
+ self.read_size = 38
+ self.prb_offset = 64
+ #self.prb_offset = 50
+
+ # parameters of the heuristics to decide whether the read is spliced
+ #self.splice_thresh = 0.005
+ self.splice_thresh = 0.01
+ self.max_intron_size = 2000
+ self.max_mismatch = 2
+ #self.splice_stop_thresh = 0.99
+ self.splice_stop_thresh = 0.8
+ self.spliced_bias = 0.0
+
+ param_lines = [\
+ ('%f','splice_thresh',self.splice_thresh),\
+ ('%d','max_intron_size',self.max_intron_size),\
+ ('%d','max_mismatch',self.max_mismatch),\
+ ('%f','splice_stop_thresh',self.splice_stop_thresh),\
+ ('%f','spliced_bias',self.spliced_bias)]
+
+ param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
+ param_lines.append('# data: %s\n'%self.data_fname)
+ param_lines.append('# param: %s\n'%param_fname)
+
+ #pdb.set_trace()
+
+ for p_line in param_lines:
+ self.result_spliced_fh.write(p_line)
+ self.result_unspliced_fh.write(p_line)
+
+ #self.original_reads = {}
+
+ # we do not have this information
+ #for line in open(reads_pipeline_fn):
+ # line = line.strip()
+ # id,seq,q1,q2,q3 = line.split()
+ # id = int(id)
+ # self.original_reads[id] = seq
+
+ lengthSP = run['numLengthSuppPoints']
+ donSP = run['numDonSuppPoints']
+ accSP = run['numAccSuppPoints']
+ mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
+ numq = run['numQualSuppPoints']
+ totalQualSP = run['totalQualSuppPoints']
+
+ currentPhi = zeros((run['numFeatures'],1))
+ currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
+
+ totalQualityPenalties = self.param[-totalQualSP:]
+ currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
+ self.currentPhi = currentPhi
+
+ # we want to identify spliced reads
+ # so true pos are spliced reads that are predicted "spliced"
+ self.true_pos = 0
+
+ # as false positives we count all reads that are not spliced but predicted
+ # as "spliced"
+ self.false_pos = 0
+
+ self.true_neg = 0
+ self.false_neg = 0
+
+      # total time spent for getting sequences and scores
+ self.get_time = 0.0
+ self.calcAlignmentScoreTime = 0.0
+ self.alternativeScoresTime = 0.0
+
+ self.count_time = 0.0
+ self.main_loop = 0.0
+ self.splice_site_time = 0.0
+ self.computeSpliceAlignWithQualityTime = 0.0
+ self.computeSpliceWeightsTime = 0.0
+ self.DotProdTime = 0.0
+ self.array_stuff = 0.0
+ stop = cpu()
+
+ self.init_time = stop-start
+
+ def filter(self):
+ """
+      This method iterates over all remapped reads and decides for each read
+      whether its unspliced vmatch alignment scores higher than the best
+      alternative spliced alignment; the read is written to the .unspliced
+      or .spliced result file accordingly.
+ """
+ run = self.run
+
+ ctr = 0
+ unspliced_ctr = 0
+ spliced_ctr = 0
+
+ print 'Starting filtering...'
+ _start = cpu()
+
+ for location,original_line in self.all_remapped_reads:
+
+ if ctr % 1000 == 0:
+ print ctr
+
+ id = location['id']
+ chr = location['chr']
+ pos = location['pos']
+ strand = location['strand']
+ seq = location['seq']
+ prb = location['prb']
+
+ id = int(id)
+
+ seq = seq.lower()
+
+ strand_map = {'D':'+', 'P':'-'}
+
+ strand = strand_map[strand]
+
+ if not chr in self.chromo_range:
+ continue
+
+ unb_seq = unbracket_seq(seq)
+
+         # reads mapped to the reverse strand have to be reverse-complemented
+ if strand == '-':
+ unb_seq = reverse_complement(unb_seq)
+ seq = reverse_complement(seq)
+
+ effective_len = len(unb_seq)
+
+ genomicSeq_start = pos
+ #genomicSeq_stop = pos+effective_len-1
+ genomicSeq_stop = pos+effective_len
+
+ start = cpu()
+ #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+ currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
+
+ # just performed some tests to check LookupTable results
+ #assert currentDNASeq_ == currentDNASeq
+ #assert currentAcc_ == currentAcc_
+ #assert currentDon_ == currentDon_
+
+ stop = cpu()
+ self.get_time += stop-start
+
+ dna = currentDNASeq
+ exons = zeros((2,1))
+ exons[0,0] = 0
+ exons[1,0] = effective_len
+ est = unb_seq
+ original_est = seq
+ quality = map(lambda x:ord(x)-self.prb_offset,prb)
+
+ currentVMatchAlignment = dna, exons, est, original_est, quality,\
+ currentAcc, currentDon
+
+ #pdb.set_trace()
+
+ #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+
+ try:
+ alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+ except:
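+            # if computing alternative alignments fails we treat the read as
+            # having no spliced alternative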
+ alternativeAlignmentScores = []
+
+ if alternativeAlignmentScores == []:
+            # no alternative spliced alignment was found
+ maxAlternativeAlignmentScore = -inf
+ vMatchScore = 0.0
+ else:
+ maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+ # compute alignment for vmatch unspliced read
+ vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+ #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+
+
+ start = cpu()
+
+ #print 'all candidates %s' % str(alternativeAlignmentScores)
+
+ new_id = id - 1000000300000
+
+ unspliced = False
+ # unspliced
+ if new_id > 0:
+ unspliced = True
+
+         # according to our learned parameters vmatch already found a good
+         # alignment of the current read
+ if maxAlternativeAlignmentScore < vMatchScore:
+ unspliced_ctr += 1
+
+ self.result_unspliced_fh.write(original_line+'\n')
+
+ if unspliced:
+ self.true_neg += 1
+ else:
+ self.false_neg += 1
+
+ # We found an alternative alignment considering splice sites that scores
+ # higher than the VMatch alignment
+ else:
+ spliced_ctr += 1
+
+ self.result_spliced_fh.write(original_line+'\n')
+
+ if unspliced:
+ self.false_pos += 1
+ else:
+ self.true_pos += 1
+
+ ctr += 1
+ stop = cpu()
+ self.count_time = stop-start
+
+ _stop = cpu()
+ self.main_loop = _stop-_start
+
+      print 'Unspliced/Spliced: %d %d'%(unspliced_ctr,spliced_ctr)
+ print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
+ print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
+
+
+ def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
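+      """
+      Collect splice site candidates whose score exceeds splice_thresh (a
+      description inferred from the code below): proximal donor/acceptor
+      sites within the read region around the vmatch hit and distal sites up
+      to max_intron_size away, together with the adjoining DNA for the
+      distal ones.
+      """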
+
+ def signum(a):
+ if a>0:
+ return 1
+ elif a<0:
+ return -1
+ else:
+ return 0
+
+ proximal_acc = []
+ for idx in xrange(max_intron_size, max_intron_size+read_size/2):
+ if currentAcc[idx]>= splice_thresh:
+ proximal_acc.append((idx,currentAcc[idx]))
+
+ proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
+ proximal_acc=proximal_acc[-2:]
+
+ distal_acc = []
+ for idx in xrange(max_intron_size+read_size, len(currentAcc)):
+ if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
+ distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
+
+ #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
+ #distal_acc=distal_acc[-2:]
+
+ proximal_don = []
+ for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
+ if currentDon[idx] >= splice_thresh:
+ proximal_don.append((idx, currentDon[idx]))
+
+ proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
+ proximal_don=proximal_don[-2:]
+
+ distal_don = []
+ for idx in xrange(1, max_intron_size):
+ if currentDon[idx] >= splice_thresh and idx>read_size:
+ distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
+
+ distal_don.sort(lambda x,y: y[0]-x[0])
+ #distal_don=distal_don[-2:]
+
+ return proximal_acc,proximal_don,distal_acc,distal_don
+
+
+ def calcAlternativeAlignments(self,location):
+ """
+      Given an alignment proposed by vmatch this function calculates possible
+      alternative spliced alignments, taking high-scoring donor/acceptor
+      splice site candidates in the vicinity into account.
+ """
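+      # Two scenarios are scored below (a summary of the code that follows):
+      # an intron starting within the read (proximal donor, distal acceptor;
+      # 3' end) and an intron ending within the read (distal donor, proximal
+      # acceptor; 5' end). Each candidate is scored as
+      #
+      #   spliced alignment score + donor plif + intron length plif
+      #   + acceptor plif + spliced_bias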
+
+ run = self.run
+
+ id = location['id']
+ chr = location['chr']
+ pos = location['pos']
+ strand = location['strand']
+ original_est = location['seq']
+ quality = location['prb']
+ quality = map(lambda x:ord(x)-self.prb_offset,quality)
+
+ original_est = original_est.lower()
+ est = unbracket_seq(original_est)
+ effective_len = len(est)
+
+ genomicSeq_start = pos - self.max_intron_size
+ genomicSeq_stop = pos + self.max_intron_size + len(est)
+
+ strand_map = {'D':'+', 'P':'-'}
+ strand = strand_map[strand]
+
+ if strand == '-':
+ est = reverse_complement(est)
+ original_est = reverse_complement(original_est)
+
+ start = cpu()
+ #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+ currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
+ stop = cpu()
+ self.get_time += stop-start
+ dna = currentDNASeq
+
+ proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
+
+ alternativeScores = []
+
+ #pdb.set_trace()
+
+ # inlined
+ h = self.h
+ d = self.d
+ a = self.a
+ mmatrix = self.mmatrix
+ qualityPlifs = self.qualityPlifs
+ # inlined
+
+ # find an intron on the 3' end
+ _start = cpu()
+ for (don_pos,don_score) in proximal_don:
+ DonorScore = calculatePlif(d, [don_score])[0]
+
+ for (acc_pos,acc_score,acc_dna) in distal_acc:
+
+ IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
+ AcceptorScore = calculatePlif(a, [acc_score])[0]
+
+ #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
+
+ # construct a new "original_est"
+ original_est_cut=''
+
+ est_ptr=0
+ dna_ptr=self.max_intron_size
+ ptr=0
+ acc_dna_ptr=0
+ num_mismatch = 0
+
+ while ptr<len(original_est):
+ #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
+
+ if original_est[ptr]=='[':
+ dnaletter=original_est[ptr+1]
+ estletter=original_est[ptr+2]
+ if dna_ptr < don_pos:
+ original_est_cut+=original_est[ptr:ptr+4]
+ num_mismatch += 1
+ else:
+ if acc_dna[acc_dna_ptr]==estletter:
+ original_est_cut += estletter # EST letter
+ else:
+ original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+ num_mismatch += 1
+ #print '['+acc_dna[acc_dna_ptr]+estletter+']'
+ acc_dna_ptr+=1
+ ptr+=4
+ else:
+ dnaletter=original_est[ptr]
+ estletter=dnaletter
+
+ if dna_ptr < don_pos:
+ original_est_cut+=estletter # EST letter
+ else:
+ if acc_dna[acc_dna_ptr]==estletter:
+ original_est_cut += estletter # EST letter
+ else:
+ num_mismatch += 1
+ original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+ #print '('+acc_dna[acc_dna_ptr]+estletter+')'
+ acc_dna_ptr+=1
+
+ ptr+=1
+
+ if estletter=='-':
+ dna_ptr+=1
+ elif dnaletter=='-':
+ est_ptr+=1
+ else:
+ dna_ptr+=1
+ est_ptr+=1
+ if num_mismatch>self.max_mismatch:
+ continue
+
+ assert(dna_ptr<=len(dna))
+ assert(est_ptr<=len(est))
+
+ #print original_est, original_est_cut
+
+ score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+ score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
+
+ alternativeScores.append(score)
+
+ if acc_score>=self.splice_stop_thresh:
+ break
+
+ #pdb.set_trace()
+
+ _stop = cpu()
+ self.alternativeScoresTime += _stop-_start
+
+ # find an intron on the 5' end
+ _start = cpu()
+ for (acc_pos,acc_score) in proximal_acc:
+
+ AcceptorScore = calculatePlif(a, [acc_score])[0]
+
+ for (don_pos,don_score,don_dna) in distal_don:
+
+ DonorScore = calculatePlif(d, [don_score])[0]
+ IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
+
+ #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
+
+ # construct a new "original_est"
+ original_est_cut=''
+
+ est_ptr=0
+ dna_ptr=self.max_intron_size
+ ptr=0
+ num_mismatch = 0
+ don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
+ while ptr<len(original_est):
+
+ if original_est[ptr]=='[':
+ dnaletter=original_est[ptr+1]
+ estletter=original_est[ptr+2]
+ if dna_ptr > acc_pos:
+ original_est_cut+=original_est[ptr:ptr+4]
+ num_mismatch += 1
+ else:
+ if don_dna[don_dna_ptr]==estletter:
+ original_est_cut += estletter # EST letter
+ else:
+ original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+ num_mismatch += 1
+ #print '['+don_dna[don_dna_ptr]+estletter+']'
+ don_dna_ptr+=1
+ ptr+=4
+ else:
+ dnaletter=original_est[ptr]
+ estletter=dnaletter
+
+ if dna_ptr > acc_pos:
+ original_est_cut+=estletter # EST letter
+ else:
+ if don_dna[don_dna_ptr]==estletter:
+ original_est_cut += estletter # EST letter
+ else:
+ original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+ num_mismatch += 1
+ #print '('+don_dna[don_dna_ptr]+estletter+')'
+ don_dna_ptr+=1
+
+ ptr+=1
+
+ if estletter=='-':
+ dna_ptr+=1
+ elif dnaletter=='-':
+ est_ptr+=1
+ else:
+ dna_ptr+=1
+ est_ptr+=1
+
+ if num_mismatch>self.max_mismatch:
+ continue
+ assert(dna_ptr<=len(dna))
+ assert(est_ptr<=len(est))
+
+ #print original_est, original_est_cut
+
+ score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+ score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
+
+ alternativeScores.append(score)
+
+ if don_score>=self.splice_stop_thresh:
+ break
+
+ _stop = cpu()
+ self.alternativeScoresTime += _stop-_start
+
+ return alternativeScores
+
+
+ def calcAlignmentScore(self,alignment):
+ """
+      Given an alignment (dna,exons,est) and the current parameters of QPalma
+      this function calculates the dot product of the feature representation
+      of the alignment and the parameter vector, i.e. the alignment score.
+ """
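+      # The parameter vector self.currentPhi was assembled in __init__ from
+      # the plif penalties (h,d,a), the match matrix and the quality
+      # penalties, so the score is Phi(alignment)^T * currentPhi.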
+
+ start = cpu()
+ run = self.run
+
+      # unpack the alignment tuple and compute its score
+ dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
+
+ score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
+
+ stop = cpu()
+ self.calcAlignmentScoreTime += stop-start
+
+ return score
--- /dev/null
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+
+class Run(dict):
+ """
+   This class represents the parameters needed for one single run of an
+   algorithm. If we, for example, perform a model selection over the
+   parameter C, then for each value of C we create one Run object storing
+   the respective parameters.
+ """
+
+ def __init__(self):
+ pass
+
+ def __repr__(self):
+
+ result = ""
+ for key,val in self.iteritems():
+ result += "%s:\t\t %s\n" % (key,str(val))
+
+ return result
+
+if __name__ == '__main__':
+ r1 = Run()
+ r1['attrib_1'] = 12
+ r1['attrib_2'] = 22
+
+ print "%s" % r1
+
--- /dev/null
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+import array
+import cPickle
+import math
+import os.path
+import pdb
+import sys
+
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
+
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
+
+#from qpalma.SIQP_CPX import SIQPSolver
+#from qpalma.SIQP_CVXOPT import SIQPSolver
+
+import QPalmaDP
+import qpalma
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.TrainingParam import Param
+from qpalma.Plif import Plf,compute_donacc
+
+from qpalma.utils import pprint_alignment, get_alignment
+
+class SpliceSiteException(Exception):
+   pass
+
+
+def getData(training_set,exampleKey,run):
+   """
+   Fetch DNA sequence, splice site scores and exon structure for one
+   training example and check the splice site dinucleotides (gt/gc ... ag).
+   """
+
+ currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
+ id,chr,strand,up_cut,down_cut = currentSeqInfo
+
+ est = original_est
+ est = "".join(est)
+ est = est.lower()
+   est = unbracket_seq(est)
+ est = est.replace('-','')
+
+ assert len(est) == run['read_size'], pdb.set_trace()
+ est_len = len(est)
+
+ #original_est = OriginalEsts[exampleIdx]
+ original_est = "".join(original_est)
+ original_est = original_est.lower()
+
+ dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
+
+ #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+
+ original_exons = currentExons
+ exons = original_exons - (up_cut-1)
+ exons[0,0] -= 1
+ exons[1,0] -= 1
+
+ if exons.shape == (2,2):
+ fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
+
+ donor_elem = dna[exons[0,1]:exons[0,1]+2]
+ acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
+
+ if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
+ print 'invalid donor in example %d'% exampleKey
+ raise SpliceSiteException
+
+ if not ( acceptor_elem == 'ag' ):
+ print 'invalid acceptor in example %d'% exampleKey
+ raise SpliceSiteException
+
+ assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
+
+ return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
+
+
+class QPalma:
+ """
+ This class wraps the training and prediction functions for
+ the alignment.
+ """
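+
+   # Intended usage (a sketch; run and seqInfo are assumed to be set up as
+   # in the surrounding pipeline scripts):
+   #
+   #   qp = QPalma(run,seqInfo)
+   #   qp.init_train(training_set) and qp.train(training_set) for training,
+   #   qp.init_prediction(dataset_fn,keys,param_fn,'test') for prediction.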
+
+ def __init__(self,run,seqInfo,dmode=False):
+ self.ARGS = Param()
+ self.qpalma_debug_mode = dmode
+ self.run = run
+ self.seqInfo = seqInfo
+
+
+ def plog(self,string):
+ self.logfh.write(string)
+ self.logfh.flush()
+
+
+ def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
+ """
+      Given the needed input this method calls the QPalma C module, which
+      performs a dynamic programming step in order to obtain an alignment.
+ """
+
+ dna_len = len(dna)
+ est_len = len(est)
+
+ prb = QPalmaDP.createDoubleArrayFromList(quality)
+ chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+ matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+ mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
+
+ d_len = len(donor)
+ donor = QPalmaDP.createDoubleArrayFromList(donor)
+ a_len = len(acceptor)
+ acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+ # Create the alignment object representing the interface to the C/C++ code.
+ currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
+ c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+      # calculates SpliceAlign, EstAlign, weightMatch, total scores, dnaest
+ currentAlignment.myalign( current_num_path, dna, dna_len,\
+ est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+ acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
+ self.run['print_matrix'] )
+
+ if prediction_mode:
+ # part that is only needed for prediction
+ result_len = currentAlignment.getResultLength()
+ dna_array,est_array = currentAlignment.getAlignmentArraysNew()
+ else:
+ dna_array = None
+ est_array = None
+
+ newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
+ currentAlignment.getAlignmentResultsNew()
+
+ return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+ newQualityPlifsFeatures, dna_array, est_array
+
+
+ def init_train(self,training_set):
+ full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
+
+ #assert not os.path.exists(full_working_path)
+ if not os.path.exists(full_working_path):
+ os.mkdir(full_working_path)
+
+ assert os.path.exists(full_working_path)
+
+ # ATTENTION: Changing working directory
+ os.chdir(full_working_path)
+
+ self.logfh = open('_qpalma_train.log','w+')
+ cPickle.dump(self.run,open('run_obj.pickle','w+'))
+
+ self.plog("Settings are:\n")
+ self.plog("%s\n"%str(self.run))
+
+ if self.run['mode'] == 'normal':
+ self.use_quality_scores = False
+
+ elif self.run['mode'] == 'using_quality_scores':
+ self.use_quality_scores = True
+ else:
+ assert(False)
+
+
+   def setUpSolver(self,numExamples,lengthSP,donSP,accSP):
+      # Initialize solver
+      self.plog('Initializing problem...\n')
+
+      try:
+         solver = SIQPSolver(self.run['numFeatures'],numExamples,self.run['C'],self.logfh,self.run)
+      except:
+         self.plog('Got no license. Telling queue to reschedule job...\n')
+         sys.exit(99)
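+
+      # the two index ranges below hold the donor and acceptor plif
+      # penalties; the solver constrains them to be monotone (an
+      # interpretation read off the index arithmetic)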
+ solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+ solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+
+ return solver
+
+
+ def train(self,training_set):
+ """
+      The main loop for training.
+ """
+
+      run = self.run
+      numExamples = len(training_set)
+ self.plog('Number of training examples: %d\n'% numExamples)
+
+ self.noImprovementCtr = 0
+ self.oldObjValue = 1e8
+
+ iteration_steps = self.run['iter_steps']
+ remove_duplicate_scores = self.run['remove_duplicate_scores']
+ print_matrix = self.run['print_matrix']
+ anzpath = self.run['anzpath']
+
+ # Initialize parameter vector
+ param = numpy.matlib.rand(run['numFeatures'],1)
+
+ lengthSP = self.run['numLengthSuppPoints']
+ donSP = self.run['numDonSuppPoints']
+ accSP = self.run['numAccSuppPoints']
+ mmatrixSP = self.run['matchmatrixRows']*run['matchmatrixCols']
+ numq = self.run['numQualSuppPoints']
+ totalQualSP = self.run['totalQualSuppPoints']
+
+ # no intron length model
+ if not self.run['enable_intron_length']:
+ param[:lengthSP] *= 0.0
+
+ # Set the parameters such as limits penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+
+      solver = self.setUpSolver(numExamples,lengthSP,donSP,accSP)
+
+ # stores the number of alignments done for each example (best path, second-best path etc.)
+ num_path = [anzpath]*numExamples
+
+ # stores the gap for each example
+ gap = [0.0]*numExamples
+
+ currentPhi = zeros((run['numFeatures'],1))
+ totalQualityPenalties = zeros((totalQualSP,1))
+
+ numConstPerRound = run['numConstraintsPerRound']
+ solver_call_ctr = 0
+
+ suboptimal_example = 0
+ iteration_nr = 0
+ param_idx = 0
+ const_added_ctr = 0
+
+ featureVectors = zeros((run['numFeatures'],numExamples))
+
+ self.plog('Starting training...\n')
+ # the main training loop
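+      # Each pass decodes the num_path[i] best alignments for every example,
+      # compares them against the true alignment and adds a margin constraint
+      # for the first wrong alignment found; the solver is re-run after every
+      # numConstPerRound added constraints (a summary of the loop below).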
+ while True:
+ if iteration_nr == iteration_steps:
+ break
+
+ for exampleIdx,example_key in enumerate(training_set.keys()):
+ print 'Current example %d' % example_key
+ try:
+ dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
+ getData(training_set,example_key,run)
+ except SpliceSiteException:
+ continue
+
+ dna_len = len(dna)
+
+            if run['enable_quality_scores']:
+               # take the first quality vector of the quality tuple
+               quality = currentQualities[0]
+            else:
+               quality = [40]*len(est)
+
+ if not run['enable_splice_signals']:
+ for idx,elem in enumerate(don_supp):
+ if elem != -inf:
+ don_supp[idx] = 0.0
+
+ for idx,elem in enumerate(acc_supp):
+ if elem != -inf:
+ acc_supp[idx] = 0.0
+
+            # compute the parameters of the true alignment (but with untrained d,a,h ...)
+            if run['mode'] == 'using_quality_scores':
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
+               computeSpliceAlignWithQuality(dna, exons, est, original_est,\
+               quality, qualityPlifs,run)
+               # dna_calc is only returned when quality scores are used
+               dna_calc = dna_calc.replace('-','')
+            else:
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+
+ #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
+ # Calculate the weights
+ trueWeightDon, trueWeightAcc, trueWeightIntron =\
+ computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+ trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+ currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
+ currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
+ currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
+ currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
+
+ if run['mode'] == 'using_quality_scores':
+ totalQualityPenalties = param[-totalQualSP:]
+ currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
+
+ # Calculate w'phi(x,y) the total score of the alignment
+ trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+ # The allWeights vector is supposed to store the weight parameter
+ # of the true alignment as well as the weight parameters of the
+ # num_path[exampleIdx] other alignments
+ allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
+ allWeights[:,0] = trueWeight[:,0]
+
+ AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
+ AlignmentScores[0] = trueAlignmentScore
+
+ ################## Calculate wrong alignment(s) ######################
+ # Compute donor, acceptor with penalty_lookup_new
+ # returns two double lists
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ ps = h.convert2SWIG()
+
+ newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+ newQualityPlifsFeatures, unneeded1, unneeded2 =\
+ self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
+ mm_len = run['matchmatrixRows']*run['matchmatrixCols']
+
+ newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+ newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+
+ newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
+ # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without
+ # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order
+ # not to incorporate a true alignment in the
+ # constraints. To keep track of the true and false alignments we
+ # define an array true_map with a boolean indicating the
+ # equivalence to the true alignment for each decoded alignment.
+ true_map = [0]*(num_path[exampleIdx]+1)
+ true_map[0] = 1
+
+ for pathNr in range(num_path[exampleIdx]):
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
+ h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
+ acc_supp)
+
+ decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
+ decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
+               # store the weights in the remaining rows of the matrix
+ allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
+
+ hpen = mat(h.penalties).reshape(len(h.penalties),1)
+ dpen = mat(d.penalties).reshape(len(d.penalties),1)
+ apen = mat(a.penalties).reshape(len(a.penalties),1)
+ features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
+
+ featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
+
+ AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+ distinct_scores = False
+ if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
+ distinct_scores = True
+
+               # check whether scalar product + loss equals the Viterbi score
+ if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+ self.plog("Scalar prod. + loss not equals Viterbi output!\n")
+ pdb.set_trace()
+
+ self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
+ self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
+
+ # if the pathNr-best alignment is very close to the true alignment consider it as true
+ if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+ true_map[pathNr+1] = 1
+
+ if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
+ print "suboptimal_example %d\n" %exampleIdx
+ #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
+ #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
+
+ #pdb.set_trace()
+ suboptimal_example += 1
+ self.plog("suboptimal_example %d\n" %exampleIdx)
+
+            # the true label sequence should not have a larger score than the
+            # maximal one; if all n-best paths are too close to each other
+            # we have to extend the n-best search to an (n+1)-best
+ if len([elem for elem in true_map if elem == 1]) == len(true_map):
+ num_path[exampleIdx] = num_path[exampleIdx]+1
+
+ # Choose true and first false alignment for extending
+ firstFalseIdx = -1
+ for map_idx,elem in enumerate(true_map):
+ if elem == 0:
+ firstFalseIdx = map_idx
+ break
+
+ if False:
+ self.plog("Is considered as: %d\n" % true_map[1])
+
+ #result_len = currentAlignment.getResultLength()
+
+ dna_array,est_array = currentAlignment.getAlignmentArraysNew()
+
+ _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
+ _newEstAlign = newEstAlign[0].flatten().tolist()[0]
+
+ # if there is at least one useful false alignment add the
+ # corresponding constraints to the optimization problem
+ if firstFalseIdx != -1:
+ firstFalseWeights = allWeights[:,firstFalseIdx]
+ differenceVector = trueWeight - firstFalseWeights
+ #pdb.set_trace()
+
+ const_added = solver.addConstraint(differenceVector, exampleIdx)
+ const_added_ctr += 1
+
+ # end of one example processing
+
+            # call the solver after every numConstPerRound added constraints
+ if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
+ objValue,w,self.slacks = solver.solve()
+ solver_call_ctr += 1
+
+ if solver_call_ctr == 5:
+ numConstPerRound = 200
+ self.plog('numConstPerRound is now %d\n'% numConstPerRound)
+
+ if math.fabs(objValue - self.oldObjValue) <= 1e-6:
+ self.noImprovementCtr += 1
+
+ if self.noImprovementCtr == numExamples+1:
+ break
+
+ self.oldObjValue = objValue
+ print "objValue is %f" % objValue
+
+ sum_xis = 0
+ for elem in self.slacks:
+ sum_xis += elem
+
+ print 'sum of slacks is %f'% sum_xis
+ self.plog('sum of slacks is %f\n'% sum_xis)
+
+ for i in range(len(param)):
+ param[i] = w[i]
+
+ cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+ param_idx += 1
+ [h,d,a,mmatrix,qualityPlifs] =\
+ set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+
+ ##############################################
+ # end of one iteration through all examples #
+ ##############################################
+
+ self.plog("suboptimal rounds %d\n" %suboptimal_example)
+
+         if self.noImprovementCtr == numExamples*2:
+            self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
+            return
+
+ iteration_nr += 1
+
+ #
+ # end of optimization
+ #
+      self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
+
+
+   def FinalizeTraining(self,vector,name):
+      self.plog("Training completed\n")
+      cPickle.dump(vector,open(name,'w+'))
+      self.logfh.close()
+
+
+###############################################################################
+#
+# End of the code needed for training
+#
+# Begin of code for prediction
+#
+###############################################################################
+
+
+ def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
+ """
+      Set up the working directory and log file, load the dataset and the
+      parameter vector and then run the prediction for the given keys.
+ """
+ self.set_name = set_name
+
+ #full_working_path = os.path.join(run['alignment_dir'],run['name'])
+ full_working_path = self.run['result_dir']
+
+ print 'full_working_path is %s' % full_working_path
+
+ #assert not os.path.exists(full_working_path)
+ if not os.path.exists(full_working_path):
+ os.mkdir(full_working_path)
+
+ assert os.path.exists(full_working_path)
+
+ # ATTENTION: Changing working directory
+ os.chdir(full_working_path)
+
+ self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
+
+ # number of prediction instances
+ self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
+
+ # load dataset and fetch instances that shall be predicted
+ dataset = cPickle.load(open(dataset_fn))
+
+ prediction_set = {}
+ for key in prediction_keys:
+ prediction_set[key] = dataset[key]
+
+ # we do not need the full dataset anymore
+ del dataset
+
+ # Load parameter vector to predict with
+ param = cPickle.load(open(param_fn))
+
+ self.predict(prediction_set,param)
+
+
+ def predict(self,prediction_set,param):
+ """
+      This method realigns every read in prediction_set using the given
+      parameter vector and collects the prediction results.
+ """
+
+ if self.run['mode'] == 'normal':
+ self.use_quality_scores = False
+
+ elif self.run['mode'] == 'using_quality_scores':
+ self.use_quality_scores = True
+ else:
+ assert(False)
+
+ # Set the parameters such as limits/penalties for the Plifs
+ [h,d,a,mmatrix,qualityPlifs] =\
+ set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+
+ if not self.qpalma_debug_mode:
+ self.plog('Starting prediction...\n')
+
+ self.problem_ctr = 0
+
+ # where we store the predictions
+ allPredictions = []
+
+ # we take the first quality vector of the tuple of quality vectors
+ quality_index = 0
+
+ # beginning of the prediction loop
+ for example_key in prediction_set.keys():
+ print 'Current example %d' % example_key
+ for example in prediction_set[example_key]:
+
+ currentSeqInfo,read,currentQualities = example
+
+ id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
+ currentSeqInfo
+
+ if not self.qpalma_debug_mode:
+ self.plog('Loading example id: %d...\n'% int(id))
+
+ if self.run['enable_quality_scores']:
+ quality = currentQualities[quality_index]
+ else:
+ quality = [40]*len(read)
+
+ try:
+ currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+ except:
+ self.problem_ctr += 1
+ continue
+
+ if not self.run['enable_splice_signals']:
+ for idx,elem in enumerate(currentDon):
+ if elem != -inf:
+ currentDon[idx] = 0.0
+
+ for idx,elem in enumerate(currentAcc):
+ if elem != -inf:
+ currentAcc[idx] = 0.0
+
+ current_prediction = self.calc_alignment(currentDNASeq, read,\
+ quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
+
+ current_prediction['id'] = id
+ current_prediction['chr'] = chromo
+ current_prediction['strand'] = strand
+ current_prediction['start_pos'] = genomicSeq_start
+
+ allPredictions.append(current_prediction)
+
+ if not self.qpalma_debug_mode:
+ self.FinalizePrediction(allPredictions)
+ else:
+ return allPredictions
+
+
+ def FinalizePrediction(self,allPredictions):
+      """ At the end of the prediction loop we save all predictions in a pickle file and exit. """
+
+ cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
+ self.plog('Prediction completed\n')
+ mes = 'Problem ctr %d' % self.problem_ctr
+ self.plog(mes+'\n')
+ self.logfh.close()
+
+
+ def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+ """
+      Given two sequences and the parameters we calculate one alignment.
+ """
+
+ run = self.run
+ donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+ if '-' in read:
+ self.plog('found gap\n')
+ read = read.replace('-','')
+         assert len(read) == run['read_size']
+
+ ps = h.convert2SWIG()
+
+ newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+ newQualityPlifsFeatures, dna_array, read_array =\
+ self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
+
+ mm_len = run['matchmatrixRows']*run['matchmatrixCols']
+
+ true_map = [0]*2
+ true_map[0] = 1
+ pathNr = 0
+
+ _newSpliceAlign = array.array('B',newSpliceAlign)
+ _newEstAlign = array.array('B',newEstAlign)
+
+ #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
+ alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array)
+
+ dna_array = array.array('B',dna_array)
+ read_array = array.array('B',read_array)
+
+ newExons = self.calculatePredictedExons(newSpliceAlign)
+
+ current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
+ 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
+ 'dna_array':dna_array, 'read_array':read_array }
+
+ return current_prediction
+
+
+ def calculatePredictedExons(self,SpliceAlign):
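+      # A sketch of the behaviour (inferred from the loop below): positions
+      # labelled 0 in SpliceAlign belong to exons, so e.g. [0,0,0,1,1,0,0]
+      # yields the half-open exon boundary list [0,3,5,7].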
+ newExons = []
+ oldElem = -1
+ SpliceAlign.append(-1)
+ for pos,elem in enumerate(SpliceAlign):
+ if pos == 0:
+ oldElem = -1
+ else:
+ oldElem = SpliceAlign[pos-1]
+
+ if oldElem != 0 and elem == 0: # start of exon
+ newExons.append(pos)
+
+ if oldElem == 0 and elem != 0: # end of exon
+ newExons.append(pos)
+
+ return newExons
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import cPickle
-import sys
-import pdb
-import os
-import os.path
-import math
-
-from qpalma.parsers import *
-
-
-data = None
-
-
-def result_statistic():
- """
-    Collect statistics over the evaluation results (stub).
- """
-
- num_unaligned_reads = 0
- num_incorrectly_aligned_reads = 0
- pass
-
-
-def createErrorVSCutPlot(results):
- """
-    This function takes the results of the evaluation and writes a tex table
-    of error counts versus read cut position.
- """
-
- fh = open('error_rates_table.tex','w+')
-    lines = ['\\begin{tabular}{|c|c|c|r|r|}', '\hline',\
-    'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on} & \multicolumn{1}{c|}{Error on} \\\\',\
-    'information & site pred. & length & \multicolumn{1}{c|}{positions} & \multicolumn{1}{c|}{scores} \\\\', '\hline']
-
- #for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
- for pos,key in enumerate(['+++']):
- res = results[key]
- for i in range(37):
- ctr = 0
- try:
- ctr = res[1][i]
- except:
- ctr = 0
-
- lines.append( '%d\n' % ctr)
-
- if pos % 2 == 1:
- lines.append('\hline')
-
- lines.append('\end{tabular}')
-
- lines = [l+'\n' for l in lines]
- for l in lines:
- fh.write(l)
- fh.close()
-
-
-def createTable(results):
- """
- This function takes the results of the evaluation and creates a tex table.
- """
-
- fh = open('result_table.tex','w+')
-    lines = ['\\begin{tabular}{|c|c|c|r|r|}', '\hline',\
-    'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on} & \multicolumn{1}{c|}{Error on} \\\\',\
-    'information & site pred. & length & \multicolumn{1}{c|}{positions} & \multicolumn{1}{c|}{scores} \\\\', '\hline']
-
- for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
- res = [e*100 for e in results[key]]
-
- lines.append( '%s & %s & %s & %2.2f & %2.2f \\%%\\\\' % ( key[0], key[1], key[2], res[0], res[1] ) )
- if pos % 2 == 1:
- lines.append('\hline')
-
- for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
- res = [e*100 for e in results[key]]
-
- lines.append( '%s & %s & %s & %2.2f & x \\%%\\\\' % ( key[0], key[1], key[2], res[2] ) )
- if pos % 2 == 1:
- lines.append('\hline')
-
- lines.append('\end{tabular}')
-
- lines = [l+'\n' for l in lines]
- for l in lines:
- fh.write(l)
- fh.close()
-
-
-def compare_scores_and_labels(scores,labels):
- """
- Iterate through all predictions. If we find a correct prediction check
- whether this correct prediction scores higher than the incorrect
- predictions for this example.
- """
-
- for currentPos,currentElem in enumerate(scores):
- if labels[currentPos] == True:
- for otherPos,otherElem in enumerate(scores):
- if otherPos == currentPos:
- continue
-
- if labels[otherPos] == False and otherElem >= currentElem:
- return False
-
- return True
-
-
-def compare_exons(predExons,trueExons):
- e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
-
- if len(predExons) == 4:
- e1_begin,e1_end = predExons[0],predExons[1]
- e2_begin,e2_end = predExons[2],predExons[3]
- else:
- return False
-
- e1_b_off = int(math.fabs(e1_begin - trueExons[0,0]))
- e1_e_off = int(math.fabs(e1_end - trueExons[0,1]))
-
- e2_b_off = int(math.fabs(e2_begin - trueExons[1,0]))
- e2_e_off = int(math.fabs(e2_end - trueExons[1,1]))
-
- if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0\
- and e2_e_off == 0:
- return True
-
- return False
-
-
-def evaluate_unmapped_example(current_prediction):
- predExons = current_prediction['predExons']
- trueExons = current_prediction['trueExons']
-
- result = compare_exons(predExons,trueExons)
- return result
-
-
-def evaluate_example(current_prediction):
- label = False
- label = current_prediction['label']
-
- pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]
-
- # if the read was mapped by vmatch at an incorrect position we only have to
- # compare the score
- if label == False:
- return label,False,pred_score
-
- predExons = current_prediction['predExons']
- trueExons = current_prediction['trueExons']
-
- predPositions = [elem + current_prediction['alternative_start_pos'] for elem in predExons]
- truePositions = [elem + current_prediction['start_pos'] for elem in trueExons.flatten().tolist()[0]]
-
- pos_comparison = (predPositions == truePositions)
-
- return label,pos_comparison,pred_score
-
-
-def prediction_on(filename):
-
- allPredictions = cPickle.load(open(filename))
-
- gt_correct_ctr = 0
- gt_incorrect_ctr = 0
- incorrect_gt_cuts = {}
-
- pos_correct_ctr = 0
- pos_incorrect_ctr = 0
- incorrect_vmatch_cuts = {}
-
- score_correct_ctr = 0
- score_incorrect_ctr = 0
-
- total_gt_examples = 0
- total_vmatch_instances_ctr = 0
-
- true_vmatch_instances_ctr = 0
-
- allUniquePredictions = [False]*len(allPredictions)
-
- for pos,current_example_pred in enumerate(allPredictions):
- for elem_nr,new_prediction in enumerate(current_example_pred[1:]):
-
- if allUniquePredictions[pos] != False:
- current_prediction = allUniquePredictions[pos]
-
- current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
- new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
-
- if current_a_score < new_score :
-                allUniquePredictions[pos] = new_prediction
-
- else:
- allUniquePredictions[pos] = new_prediction
-
- for current_pred in allUniquePredictions:
- if current_pred == False:
- continue
-
- #for current_example_pred in allPredictions:
- #gt_example = current_example_pred[0]
- #gt_score = gt_example['DPScores'].flatten().tolist()[0][0]
- #gt_correct = evaluate_unmapped_example(gt_example)
-
- #exampleIdx = gt_example['exampleIdx']
-
- #cut_pos = gt_example['true_cut']
-
- #if gt_correct:
- # gt_correct_ctr += 1
- #else:
- # gt_incorrect_ctr += 1
-
- # try:
- # incorrect_gt_cuts[cut_pos] += 1
- # except:
- # incorrect_gt_cuts[cut_pos] = 1
-
- #total_gt_examples += 1
-
- #current_scores = []
- #current_labels = []
- #for elem_nr,current_pred in enumerate(current_example_pred[1:]):
-
- current_label,comparison_result,current_score = evaluate_example(current_pred)
-
- # if vmatch found the right read pos we check for right exons
- # boundaries
- #if current_label:
- if comparison_result:
- pos_correct_ctr += 1
- else:
- pos_incorrect_ctr += 1
-
- #try:
- # incorrect_vmatch_cuts[cut_pos] += 1
- #except:
- # incorrect_vmatch_cuts[cut_pos] = 1
-
- true_vmatch_instances_ctr += 1
-
- #current_scores.append(current_score)
- #current_labels.append(current_label)
-
- total_vmatch_instances_ctr += 1
-
- # check whether the correct predictions score higher than the incorrect
- # ones
- #cmp_res = compare_scores_and_labels(current_scores,current_labels)
- #if cmp_res:
- # score_correct_ctr += 1
- #else:
- # score_incorrect_ctr += 1
-
- # now that we have evaluated all instances put out all counters and sizes
- print 'Total num. of examples: %d' % len(allPredictions)
- print 'Number of correct ground truth examples: %d' % gt_correct_ctr
- print 'Total num. of true vmatch instances %d' % true_vmatch_instances_ctr
- print 'Correct pos: %d, incorrect pos: %d' % (pos_correct_ctr,pos_incorrect_ctr)
- print 'Total num. of vmatch instances %d' % total_vmatch_instances_ctr
- print 'Correct scores: %d, incorrect scores: %d' %\
- (score_correct_ctr,score_incorrect_ctr)
-
- pos_error = 1.0 * pos_incorrect_ctr / total_vmatch_instances_ctr
- score_error = 1.0 * score_incorrect_ctr / total_vmatch_instances_ctr
- gt_error = 1.0 * gt_incorrect_ctr / total_gt_examples
-
- return (pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts)
-
-
-def collect_prediction(current_dir,run_name):
- """
- Given the toplevel directory this function takes care that for each distinct
- experiment the training and test predictions are evaluated.
-
- """
- idx = 5
-
- train_suffix = '_%d_allPredictions_TRAIN' % (idx)
- test_suffix = '_%d_allPredictions_TEST' % (idx)
-
- jp = os.path.join
- b2s = ['-','+']
-
- currentRun = cPickle.load(open(jp(current_dir,'run_object_%d.pickle'%(idx))))
- QFlag = currentRun['enable_quality_scores']
- SSFlag = currentRun['enable_splice_signals']
- ILFlag = currentRun['enable_intron_length']
- currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
-
- #filename = jp(current_dir,run_name)+train_suffix
- #print 'Prediction on: %s' % filename
- #train_result = prediction_on(filename)
- train_result = []
-
- filename = jp(current_dir,run_name)+test_suffix
- print 'Prediction on: %s' % filename
- test_result = prediction_on(filename)
-
- return train_result,test_result,currentRunId
-
-
-def perform_prediction(current_dir,run_name):
- """
- This function takes care of starting the jobs needed for the prediction phase
- of qpalma
- """
-
- #for i in range(1,6):
- for i in range(1,2):
- cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s %d |\
- qsub -l h_vmem=12.0G -cwd -j y -N \"%s_%d.log\"'%(current_dir,i,run_name,i)
-
- #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
- #print cmd
- os.system(cmd)
-
-
-
-def forall_experiments(current_func,tl_dir):
- """
-    Given the toplevel directory this function calls the function given as
-    first argument for each subdirectory. At the moment these are:
-
- - perform_prediction, and
- - collect_prediction.
-
- """
-
- dir_entries = os.listdir(tl_dir)
- dir_entries = [os.path.join(tl_dir,de) for de in dir_entries]
- run_dirs = [de for de in dir_entries if os.path.isdir(de)]
-
- all_results = {}
- all_error_rates = {}
-
- for current_dir in run_dirs:
- run_name = current_dir.split('/')[-1]
-
- pdb.set_trace()
-
- if current_func.__name__ == 'perform_prediction':
- current_func(current_dir,run_name)
-
- if current_func.__name__ == 'collect_prediction':
- train_result,test_result,currentRunId = current_func(current_dir,run_name)
- all_results[currentRunId] = test_result
- pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts = test_result
- all_error_rates[currentRunId] = (incorrect_gt_cuts,incorrect_vmatch_cuts)
-
- if current_func.__name__ == 'collect_prediction':
- #createErrorVSCutPlot(all_error_rates)
- createTable(all_results)
-
-def _predict_on(filename,filtered_reads,with_coverage):
- """
- This function evaluates the predictions made by QPalma.
- It needs a pickled file containing the predictions themselves and the
- ascii file with original reads.
-
-    Optionally one can specify a coverage file containing for each read the
- coverage number estimated by a remapping step.
-
- """
-
- coverage_map = {}
-
- if with_coverage:
- for line in open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/coverage_results/ALL_COVERAGES'):
- id,coverage_nr = line.strip().split()
- coverage_map[int(id)] = int(coverage_nr)
-
- print 'parsing filtered reads..'
- all_filtered_reads = parse_filtered_reads(filtered_reads)
- print 'found %d filtered reads' % len(all_filtered_reads)
-
- out_fh = open('predicted_positions.txt','w+')
-
- allPredictions = cPickle.load(open(filename))
-
- spliced_ctr = 0
- unspliced_ctr = 0
-
- pos_correct_ctr = 0
- pos_incorrect_ctr = 0
-
- correct_spliced_ctr = 0
- correct_unspliced_ctr = 0
-
- incorrect_spliced_ctr = 0
- incorrect_unspliced_ctr = 0
-
- correct_covered_splice_ctr = 0
- incorrect_covered_splice_ctr = 0
-
- total_vmatch_instances_ctr = 0
-
- unspliced_spliced_reads_ctr = 0
- wrong_spliced_reads_ctr = 0
-
- wrong_aligned_unspliced_reads_ctr = 0
- wrong_unspliced_reads_ctr = 0
-
- cut_pos_ctr = {}
-
- total_ctr = 0
- skipped_ctr = 0
-
- is_spliced = False
- min_coverage = 3
-
- allUniqPredictions = {}
-
- print 'Got %d predictions' % len(allPredictions)
-
- for new_prediction in allPredictions:
- id = new_prediction['id']
- id = int(id)
-
- if allUniqPredictions.has_key(id):
- current_prediction = allUniqPredictions[id]
-
- current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
- new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
-
- if current_a_score < new_score :
- allUniqPredictions[id] = new_prediction
-
- else:
- allUniqPredictions[id] = new_prediction
-
- print 'Got %d uniq predictions' % len(allUniqPredictions)
-
- #for current_prediction in allPredictions:
- for _id,current_prediction in allUniqPredictions.items():
- id = current_prediction['id']
- id = int(id)
-
- if not id >= 1000000300000:
- is_spliced = True
- else:
- is_spliced = False
-
- is_covered = False
-
- if is_spliced and with_coverage:
- try:
- current_coverage_nr = coverage_map[id]
- is_covered = True
- except:
- is_covered = False
-
-
- if is_spliced:
- spliced_ctr += 1
- else:
- unspliced_ctr += 1
-
- try:
- current_ground_truth = all_filtered_reads[id]
- except:
- skipped_ctr += 1
- continue
-
- start_pos = current_prediction['start_pos']
- chr = current_prediction['chr']
- strand = current_prediction['strand']
-
- #score = current_prediction['DPScores'].flatten().tolist()[0][0]
- #pdb.set_trace()
-
- predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
- predExons = [e+start_pos for e in predExons]
-
- spliced_flag = False
-
- if len(predExons) == 4:
- spliced_flag = True
- predExons[1] -= 1
- predExons[3] -= 1
-
- if predExons[0] == 19504568:
- pdb.set_trace()
-
- cut_pos = current_ground_truth['true_cut']
- p_start = current_ground_truth['p_start']
- e_stop = current_ground_truth['exon_stop']
- e_start = current_ground_truth['exon_start']
- p_stop = current_ground_truth['p_stop']
-
- true_cut = current_ground_truth['true_cut']
-
- if p_start == predExons[0] and e_stop == predExons[1] and\
- e_start == predExons[2] and p_stop == predExons[3]:
- pos_correct = True
- else:
- pos_correct = False
-
- elif len(predExons) == 2:
- spliced_flag = False
- predExons[1] -= 1
-
- cut_pos = current_ground_truth['true_cut']
- p_start = current_ground_truth['p_start']
- p_stop = current_ground_truth['p_stop']
-
- true_cut = current_ground_truth['true_cut']
-
- if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
- pos_correct = True
- else:
- pos_correct = False
-
- else:
- pos_correct = False
-
- if is_spliced and not spliced_flag:
- unspliced_spliced_reads_ctr += 1
-
- if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
- wrong_spliced_reads_ctr += 1
-
- if not is_spliced and spliced_flag:
- wrong_unspliced_reads_ctr += 1
-
- if not is_spliced and not pos_correct:
- wrong_aligned_unspliced_reads_ctr += 1
-
- if pos_correct:
- pos_correct_ctr += 1
-
- if is_spliced:
- correct_spliced_ctr += 1
- if with_coverage and is_covered and current_coverage_nr >= min_coverage:
- correct_covered_splice_ctr += 1
-
- if not is_spliced:
- correct_unspliced_ctr += 1
-
- else:
- pos_incorrect_ctr += 1
-
- if is_spliced:
- incorrect_spliced_ctr += 1
- if with_coverage and is_covered and current_coverage_nr >= min_coverage:
- incorrect_covered_splice_ctr += 1
-
- if not is_spliced:
- incorrect_unspliced_ctr += 1
-
- if with_coverage and spliced_flag:
- if not is_covered:
- current_coverage_nr=0
- if pos_correct:
- print "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
- else:
- print "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
-
- total_ctr += 1
-
-
- numPredictions = len(allUniqPredictions)
-
- # now that we have evaluated all instances put out all counters and sizes
- print 'Total num. of examples: %d' % numPredictions
-
- print "spliced/unspliced: %d,%d " % (spliced_ctr, unspliced_ctr )
- print "Correct/incorrect spliced: %d,%d " % (correct_spliced_ctr, incorrect_spliced_ctr )
- print "Correct/incorrect unspliced: %d,%d " % (correct_unspliced_ctr , incorrect_unspliced_ctr )
- print "Correct/incorrect covered spliced read: %d,%d " %\
- (correct_covered_splice_ctr,incorrect_covered_splice_ctr)
-
- print "pos_correct: %d,%d" % (pos_correct_ctr , pos_incorrect_ctr )
-
- print 'unspliced_spliced reads: %d' % unspliced_spliced_reads_ctr
- print 'spliced reads at wrong_place: %d' % wrong_spliced_reads_ctr
-
- print 'spliced_unspliced reads: %d' % wrong_unspliced_reads_ctr
- print 'wrong aligned at wrong_pos: %d' % wrong_aligned_unspliced_reads_ctr
-
- print 'total_ctr: %d' % total_ctr
-
- print "skipped: %d " % skipped_ctr
- print 'min. coverage: %d' % min_coverage
-
- result_dict = {}
- result_dict['skipped_ctr'] = skipped_ctr
- result_dict['min_coverage'] = min_coverage
-
- return result_dict
-
-
-
-
-
-def predict_on(allPredictions,all_filtered_reads,all_labels_fn,with_coverage,coverage_fn,coverage_labels_fn):
- """
- This function evaluates the predictions made by QPalma.
- It needs a pickled file containing the predictions themselves and the
- ascii file with original reads.
-
-    Optionally one can specify a coverage file containing for each read the
- coverage number estimated by a remapping step.
-
-
- """
-
- coverage_labels_fh = open(coverage_labels_fn,'w+')
-
- all_labels_fh = open(all_labels_fn,'w+')
-
- import qparser
- qparser.parse_reads(all_filtered_reads)
-
- coverage_map = {}
-
-
- if with_coverage:
- for line in open(coverage_fn):
- id,coverage_nr = line.strip().split()
- coverage_map[int(id)] = int(coverage_nr)
-
- #out_fh = open('predicted_positions.txt','w+')
-
- spliced_ctr = 0
- unspliced_ctr = 0
-
- pos_correct_ctr = 0
- pos_incorrect_ctr = 0
-
- correct_spliced_ctr = 0
- correct_unspliced_ctr = 0
-
- incorrect_spliced_ctr = 0
- incorrect_unspliced_ctr = 0
-
- correct_covered_splice_ctr = 0
- incorrect_covered_splice_ctr = 0
-
- total_vmatch_instances_ctr = 0
-
- unspliced_spliced_reads_ctr = 0
- wrong_spliced_reads_ctr = 0
-
- wrong_aligned_unspliced_reads_ctr = 0
- wrong_unspliced_reads_ctr = 0
-
- cut_pos_ctr = {}
-
- total_ctr = 0
- skipped_ctr = 0
-
- is_spliced = False
- min_coverage = 3
-
- allUniqPredictions = {}
-
- print 'Got %d predictions' % len(allPredictions)
-
- for k,predictions in allPredictions.items():
- for new_prediction in predictions:
- id = new_prediction['id']
- id = int(id)
-
- if allUniqPredictions.has_key(id):
- current_prediction = allUniqPredictions[id]
-
- current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
- new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
-
- if current_a_score < new_score :
- allUniqPredictions[id] = new_prediction
-
- else:
- allUniqPredictions[id] = new_prediction
-
- print 'Got %d uniq predictions' % len(allUniqPredictions)
-
- for _id,current_prediction in allUniqPredictions.items():
- id = current_prediction['id']
- id = int(id)
-
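- # in this dataset read ids below 1000000300000 denote spliced reads and
- # larger ids unspliced ones (the same offset is subtracted as new_id in
- # PipelineHeuristic.filter below)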
- if not id >= 1000000300000:
- is_spliced = True
- else:
- is_spliced = False
-
- is_covered = False
-
- if is_spliced and with_coverage:
- try:
- current_coverage_nr = coverage_map[id]
- is_covered = True
- except:
- is_covered = False
-
-
- if is_spliced:
- spliced_ctr += 1
- else:
- unspliced_ctr += 1
-
- try:
- #current_ground_truth = all_filtered_reads[id]
- current_ground_truth = qparser.fetch_read(id)
- except:
- skipped_ctr += 1
- continue
-
- start_pos = current_prediction['start_pos']
- chr = current_prediction['chr']
- strand = current_prediction['strand']
-
- #score = current_prediction['DPScores'].flatten().tolist()[0][0]
- #pdb.set_trace()
-
- predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
- predExons = [e+start_pos for e in predExons]
-
- spliced_flag = False
-
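- # four exon boundaries mean a spliced alignment (two exons), two
- # boundaries an unspliced one; anything else is counted as incorrect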
- if len(predExons) == 4:
- spliced_flag = True
- predExons[1] -= 1
- predExons[3] -= 1
-
- cut_pos = current_ground_truth['true_cut']
- p_start = current_ground_truth['p_start']
- e_stop = current_ground_truth['exon_stop']
- e_start = current_ground_truth['exon_start']
- p_stop = current_ground_truth['p_stop']
-
- true_cut = current_ground_truth['true_cut']
-
- if p_start == predExons[0] and e_stop == predExons[1] and\
- e_start == predExons[2] and p_stop == predExons[3]:
- pos_correct = True
- else:
- pos_correct = False
-
- elif len(predExons) == 2:
- spliced_flag = False
- predExons[1] -= 1
-
- cut_pos = current_ground_truth['true_cut']
- p_start = current_ground_truth['p_start']
- p_stop = current_ground_truth['p_stop']
-
- true_cut = current_ground_truth['true_cut']
-
- if p_start == predExons[0]: # and math.fabs(p_stop - predExons[1]) <= 2:
- pos_correct = True
- else:
- pos_correct = False
-
- else:
- pos_correct = False
-
- if is_spliced and not spliced_flag:
- unspliced_spliced_reads_ctr += 1
-
- if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
- wrong_spliced_reads_ctr += 1
-
- if not is_spliced and spliced_flag:
- wrong_unspliced_reads_ctr += 1
-
- if not is_spliced and not pos_correct:
- wrong_aligned_unspliced_reads_ctr += 1
-
- if pos_correct:
- pos_correct_ctr += 1
-
- if is_spliced:
- correct_spliced_ctr += 1
- all_labels_fh.write('%d correct\n'%id)
- if with_coverage and is_covered and current_coverage_nr >= min_coverage:
- correct_covered_splice_ctr += 1
-
- if not is_spliced:
- correct_unspliced_ctr += 1
-
- else:
- pos_incorrect_ctr += 1
-
- if is_spliced:
- incorrect_spliced_ctr += 1
- all_labels_fh.write('%d wrong\n'%id)
- if with_coverage and is_covered and current_coverage_nr >= min_coverage:
- incorrect_covered_splice_ctr += 1
-
- if not is_spliced:
- incorrect_unspliced_ctr += 1
-
- if with_coverage:
- if not is_covered:
- current_coverage_nr=0
-
- if pos_correct:
- new_line = "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
- else:
- new_line = "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
-
- coverage_labels_fh.write(new_line+'\n')
-
- total_ctr += 1
-
- coverage_labels_fh.close()
-
- numPredictions = len(allUniqPredictions)
-
- result = []
-
- # now that we have evaluated all instances we print out all counters and sizes
- result.append(('numPredictions',numPredictions))
- result.append(('spliced_ctr',spliced_ctr))
- result.append(('unspliced_ctr',unspliced_ctr))
-
- result.append(('correct_spliced_ctr',correct_spliced_ctr))
- result.append(('incorrect_spliced_ctr',incorrect_spliced_ctr))
-
- result.append(('correct_unspliced_ctr',correct_unspliced_ctr))
- result.append(('incorrect_unspliced_ctr',incorrect_unspliced_ctr))
-
- result.append(('correct_covered_splice_ctr',correct_covered_splice_ctr))
- result.append(('incorrect_covered_splice_ctr',incorrect_covered_splice_ctr))
-
- result.append(('pos_correct_ctr',pos_correct_ctr))
- result.append(('pos_incorrect_ctr',pos_incorrect_ctr))
-
- result.append(('unspliced_spliced_reads_ctr',unspliced_spliced_reads_ctr))
- result.append(('wrong_spliced_reads_ctr',wrong_spliced_reads_ctr))
-
- result.append(('wrong_unspliced_reads_ctr',wrong_unspliced_reads_ctr))
- result.append(('wrong_aligned_unspliced_reads_ctr',wrong_aligned_unspliced_reads_ctr))
-
- result.append(('total_ctr',total_ctr))
-
- result.append(('skipped_ctr',skipped_ctr))
- result.append(('min_coverage',min_coverage))
-
- return result
-
-
-
-def print_result(result):
- # now that we have evaluated all instances we print out all counters and sizes
- for name,ctr in result:
- print name,ctr
-
-
-def load_chunks(current_dir):
- chunks_fn = []
- for fn in os.listdir(current_dir):
- if fn.startswith('chunk'):
- chunks_fn.append(fn)
-
- allPredictions = []
-
- for c_fn in chunks_fn:
- full_fn = os.path.join(current_dir,c_fn)
- print full_fn
- current_chunk = cPickle.load(open(full_fn))
- allPredictions.extend(current_chunk)
-
- return allPredictions
-
-
-def predict_on_all_chunks(current_dir,training_keys_fn):
- """
- We load all chunks from current_dir belonging to one run. Then we load
- the saved keys of the training set to restore the training and testing
- sets. Once we have done that, we evaluate both sets separately.
- """
-
- allPredictions = load_chunks(current_dir)
-
- allPredictionsDict = {}
- for elem in allPredictions:
- id = elem['id']
-
- if allPredictionsDict.has_key(id):
- old_entry = allPredictionsDict[id]
- old_entry.append(elem)
- allPredictionsDict[id] = old_entry
- else:
- allPredictionsDict[id] = [elem]
-
- training_keys = cPickle.load(open(training_keys_fn))
-
- training_set = {}
- for key in training_keys:
- # we have the try construct because some of the reads used for training
- # may not be found using vmatch at all
- try:
- training_set[key] = allPredictionsDict[key]
- del allPredictionsDict[key]
- except:
- pass
-
- test_set = allPredictionsDict
-
- #test_set = {}
- #for k in allPredictionsDict.keys()[:100]:
- # test_set[k] = allPredictionsDict[k]
- #result_train = predict_on(training_set,all_filtered_reads,False,coverage_fn)
- #pdb.set_trace()
-
- # this is the heuristic.parsed_spliced_reads.txt file from the vmatch remapping step
- coverage_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/all_coverages'
-
- all_filtered_reads = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
-
- coverage_labels_fn = 'COVERAGE_LABELS'
-
- result_test = predict_on(test_set,all_filtered_reads,'all_prediction_labels.txt',True,coverage_fn,coverage_labels_fn)
- #print_result(result_train)
-
- return result_test
-
-
-if __name__ == '__main__':
- pass
+++ /dev/null
-###############################################################################
-#
-# This file contains settings for one experiment
-#
-# The general idea is as follows:
-#
-# Suppose you have a machine learning algorithm you want to perform model
-# selection with. Then for each different value of, say, C for a C-SVM, this
-# script generates a Run object (a subclass of dict) storing the parameters.
-#
-###############################################################################
-
-import QPalmaConfiguration as Conf
-from Run import *
-import pdb
-import os
-import os.path
-import cPickle
-
-def get_dataset(dataset_size,num_nodes):
- all_instances = []
-
- params = [\
- ('prediction_begin',400000),\
- ('prediction_end',440000)]
-
- all_instances.append(params)
-
- return all_instances
-
-
-def get_dataset_splitting_instances(dataset_size,num_nodes):
- all_instances = []
-
- part = dataset_size / num_nodes
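- # integer division; the remainder is absorbed by the last node, e.g.
- # dataset_size=10, num_nodes=3 yields the slices (0,3), (3,6), (6,10)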
- begin = 0
- end = 0
- for idx in range(1,num_nodes+1):
-
- if idx == num_nodes:
- begin = end
- end = dataset_size
- else:
- begin = end
- end = begin+part
-
- params = [\
- ('prediction_begin',begin),\
- ('prediction_end',end)]
-
- all_instances.append(params)
-
- return all_instances
-
-
-def get_training_instances():
- all_instances = []
-
- params = [\
- ('enable_quality_scores',True),\
- ('enable_splice_signals',True),\
- ('enable_intron_length',True)]
-
- all_instances.append(params)
-
- return all_instances
-
-
-def get_scoring_instances():
- all_instances = []
-
- for QFlag in [True,False]:
- for SSFlag in [True,False]:
- for ILFlag in [True,False]:
- params = [\
- ('enable_quality_scores',QFlag),\
- ('enable_splice_signals',SSFlag),\
- ('enable_intron_length',ILFlag)]
- all_instances.append(params)
-
- return all_instances
-
-
-def createRuns():
- # load the configuration object
- Config = cPickle.load(open(Conf.conf_object_path))
-
- # the main directory where all results are stored
- alignment_dir = Config['alignment_dir']
-
- # list of regularization parameters and additional flags for different runs
- # for example:
- # - with quality scores
- # - without quality scores
- #
- bool2str = ['-','+']
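- # bool is an int subclass, so a flag can index bool2str directly:
- # bool2str[True] == '+', bool2str[False] == '-'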
- ctr = 1
-
- all_instances = get_training_instances()
-
- allRuns = []
- for parameters in all_instances:
- # create a new Run object
- currentRun = Run()
- currentName = 'run'
-
- for param_name,param in parameters:
- currentRun[param_name] = param
- currentName += '_%s_%s' % (str(param_name),str(bool2str[param]))
-
- # global settings for all runs
- currentRun['anzpath'] = Conf.anzpath
- currentRun['iter_steps'] = Conf.iter_steps
- currentRun['matchmatrixRows'] = Conf.sizeMatchmatrix[0]
- currentRun['matchmatrixCols'] = Conf.sizeMatchmatrix[1]
- currentRun['mode'] = Conf.mode
- currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound
-
- currentRun['remove_duplicate_scores'] = Conf.remove_duplicate_scores
- currentRun['print_matrix'] = Conf.print_matrix
- currentRun['read_size'] = Conf.read_size
-
- currentRun['numDonSuppPoints'] = 10
- currentRun['numAccSuppPoints'] = 10
-
- currentRun['numQualPlifs'] = Conf.numQualPlifs
- currentRun['numQualSuppPoints'] = 10
- currentRun['totalQualSuppPoints'] = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
-
- currentRun['enable_quality_scores'] = True
- currentRun['enable_splice_signals'] = True
- currentRun['enable_intron_length'] = True
-
- # if we are not using an intron length model at all we do not need the support points
- currentRun['numLengthSuppPoints'] = 10 #Conf.numLengthSuppPoints
-
- if currentRun['enable_intron_length'] == False:
- currentRun['numLengthSuppPoints'] = 2 #Conf.numLengthSuppPoints
-
- currentRun['numFeatures'] = currentRun['numLengthSuppPoints']\
- + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
- + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
- + currentRun['totalQualSuppPoints']
-
- # run-specific settings
- currentRun['C'] = 100
-
- currentRun['name'] = currentName
- currentRun['alignment_dir'] = alignment_dir
-
- currentRun['min_intron_len'] = 20
- currentRun['max_intron_len'] = 2000
-
- currentRun['min_svm_score'] = 0.0
- currentRun['max_svm_score'] = 1.0
-
- currentRun['min_qual'] = -5
- currentRun['max_qual'] = 40
-
- currentRun['dna_flat_files'] = Conf.dna_flat_fn
-
- currentRun['id'] = ctr
- ctr += 1
-
- allRuns.append(currentRun)
-
-###############################################################################
-#
-# check for valid paths / options etc
-#
-###############################################################################
-
- for currentRun in allRuns:
-
- assert 0 < currentRun['anzpath'] < 100
- assert currentRun['iter_steps']
-
- #assert currentRun['matchmatrixCols']
- #assert currentRun['matchmatrixRows']
-
- assert currentRun['mode'] in ['normal','using_quality_scores']
-
- #assert currentRun['numConstraintsPerRound']
-
- assert 0 < currentRun['numFeatures'] < 10000
-
- # assert currentRun['numLengthSuppPoints']
- # assert currentRun['numDonSuppPoints']
- # assert currentRun['numAccSuppPoints']
- #assert currentRun['numQualPlifs']
- #assert currentRun['numQualSuppPoints']
- #assert numQualPlifs >= 0
- #assert numDonSuppPoints > 1
- #assert numAccSuppPoints > 1
- #assert numLengthSuppPoints > 1
- #assert numQualSuppPoints > 1
-
- assert currentRun['print_matrix'] in [True,False]
- assert 0 < currentRun['read_size'] < 100
- assert currentRun['remove_duplicate_scores'] in [True,False]
-
- assert currentRun['enable_quality_scores'] in [True,False]
- assert currentRun['enable_splice_signals'] in [True,False]
- assert currentRun['enable_intron_length'] in [True,False]
-
- #assert currentRun['totalQualSuppPoints']
- assert os.path.exists(currentRun['alignment_dir'])
-
- return allRuns
-
-if __name__ == '__main__':
- allRuns = createRuns()
- #pdb.set_trace()
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-import cPickle
-import os
-import os.path
-import time
-import pdb
-import subprocess
-
-from qpalma_main import QPalma
-import Experiment as Exp
-
-class Model:
-
- allInstances = []
-
- def __init__(self):
- pass
-
- def createInstances(self):
-
- allRuns = Exp.createRuns()
-
- #pdb.set_trace()
-
- for currentRun in allRuns:
-
- currentInstance = QPalma(currentRun)
-
- #print 'instance created, starting to pickle configuration...'
- name = currentRun['name']
- instance_fn = '%s.pickle'%name
- fh = open(instance_fn,'w+')
- cPickle.dump(currentInstance,fh)
- fh.close()
-
- self.allInstances.append(instance_fn)
-
-
- def doSelection(self):
- for instance in self.allInstances:
- time.sleep(2)
- cmd = 'echo ./resurrect %s | qsub -l h_vmem=5.0G -cwd -j y -N \"%s.log\"'%(instance,instance)
- #print cmd
- os.system(cmd)
-
- #os.system('echo "./resurrect %s >out_%s.log 2>err_%s.log &"'%(instance,instance,instance))
-
-
- def doPrediction(self):
- current_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
-
- for instance in self.allInstances:
- time.sleep(2)
- cmd = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s 1>_%s.log 2>%s.log & '%(instance,instance,instance)
- #cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s |\
- #qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(instance,instance)
- #os.system(cmd)
- obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- #out,err = obj.communicate()
- #print out,err
-
-
-
-if __name__ == '__main__':
- m = Model()
- m.createInstances()
- m.doSelection()
- #m.doPrediction()
-
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Gunnar Rätsch, Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import cPickle
-import sys
-import pdb
-import os
-import os.path
-import resource
-
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy import inf,mean
-
-from ParaParser import ParaParser,IN_VECTOR
-
-from qpalma.Lookup import LookupTable
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
-
-
-def cpu():
- return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
- resource.getrusage(resource.RUSAGE_SELF).ru_stime)
-
-
-class BracketWrapper:
- fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
-
- def __init__(self,filename):
- self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
- self.parser.parseFile(filename)
-
- def __len__(self):
- return self.parser.getSize(IN_VECTOR)
-
- def __getitem__(self,key):
- return self.parser.fetchEntry(key)
-
- def __iter__(self):
- self.counter = 0
- self.size = self.parser.getSize(IN_VECTOR)
- return self
-
- def next(self):
- if not self.counter < self.size:
- raise StopIteration
- return_val = self.parser.fetchEntry(self.counter)
- self.counter += 1
- return return_val
-
-
-class PipelineHeuristic:
- """
- This class wraps the filter which decides whether an alignment found by
- vmatch is spliced and should then be newly aligned using QPalma or not.
- """
-
- def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
- """
- We need a run object holding information about the nr. of support points
- etc.
- """
-
- run = cPickle.load(open(run_fname))
- self.run = run
-
- start = cpu()
- self.all_remapped_reads = BracketWrapper(data_fname)
- stop = cpu()
-
- print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
-
- self.chromo_range = settings['allowed_fragments']
-
- accessWrapper = DataAccessWrapper(settings)
- seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
- start = cpu()
- self.lt1 = LookupTable(settings)
- stop = cpu()
-
- print 'prefetched sequence and splice data in %f sec' % (stop-start)
-
- self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
- self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
-
- start = cpu()
-
- self.data_fname = data_fname
-
- self.param = cPickle.load(open(param_fname))
-
- # Set the parameters such as limits/penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
-
- self.h = h
- self.d = d
- self.a = a
- self.mmatrix = mmatrix
- self.qualityPlifs = qualityPlifs
-
- self.read_size = 38
- self.prb_offset = 64
- #self.prb_offset = 50
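- # quality strings are ASCII-encoded; subtracting prb_offset (64 for
- # Solexa-style scores) recovers the numeric quality values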
-
- # parameters of the heuristics to decide whether the read is spliced
- #self.splice_thresh = 0.005
- self.splice_thresh = 0.01
- self.max_intron_size = 2000
- self.max_mismatch = 2
- #self.splice_stop_thresh = 0.99
- self.splice_stop_thresh = 0.8
- self.spliced_bias = 0.0
-
- param_lines = [\
- ('%f','splice_thresh',self.splice_thresh),\
- ('%d','max_intron_size',self.max_intron_size),\
- ('%d','max_mismatch',self.max_mismatch),\
- ('%f','splice_stop_thresh',self.splice_stop_thresh),\
- ('%f','spliced_bias',self.spliced_bias)]
-
- param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
- param_lines.append('# data: %s\n'%self.data_fname)
- param_lines.append('# param: %s\n'%param_fname)
-
- #pdb.set_trace()
-
- for p_line in param_lines:
- self.result_spliced_fh.write(p_line)
- self.result_unspliced_fh.write(p_line)
-
- #self.original_reads = {}
-
- # we do not have this information
- #for line in open(reads_pipeline_fn):
- # line = line.strip()
- # id,seq,q1,q2,q3 = line.split()
- # id = int(id)
- # self.original_reads[id] = seq
-
- lengthSP = run['numLengthSuppPoints']
- donSP = run['numDonSuppPoints']
- accSP = run['numAccSuppPoints']
- mmatrixSP = run['matchmatrixRows']*run['matchmatrixCols']
- numq = run['numQualSuppPoints']
- totalQualSP = run['totalQualSuppPoints']
-
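- # assemble the current parameter vector phi in the canonical feature
- # order: intron length, donor, acceptor, match matrix, quality plifs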
- currentPhi = zeros((run['numFeatures'],1))
- currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
-
- totalQualityPenalties = self.param[-totalQualSP:]
- currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
- self.currentPhi = currentPhi
-
- # we want to identify spliced reads
- # so true pos are spliced reads that are predicted "spliced"
- self.true_pos = 0
-
- # as false positives we count all reads that are not spliced but predicted
- # as "spliced"
- self.false_pos = 0
-
- self.true_neg = 0
- self.false_neg = 0
-
- # total time spend for get seq and scores
- self.get_time = 0.0
- self.calcAlignmentScoreTime = 0.0
- self.alternativeScoresTime = 0.0
-
- self.count_time = 0.0
- self.main_loop = 0.0
- self.splice_site_time = 0.0
- self.computeSpliceAlignWithQualityTime = 0.0
- self.computeSpliceWeightsTime = 0.0
- self.DotProdTime = 0.0
- self.array_stuff = 0.0
- stop = cpu()
-
- self.init_time = stop-start
-
- def filter(self):
- """
- This method iterates over all remapped reads and decides for each one
- whether vmatch already found a good unspliced alignment or whether the
- read is probably spliced and should be realigned using QPalma.
- """
- run = self.run
-
- ctr = 0
- unspliced_ctr = 0
- spliced_ctr = 0
-
- print 'Starting filtering...'
- _start = cpu()
-
- for location,original_line in self.all_remapped_reads:
-
- if ctr % 1000 == 0:
- print ctr
-
- id = location['id']
- chr = location['chr']
- pos = location['pos']
- strand = location['strand']
- seq = location['seq']
- prb = location['prb']
-
- id = int(id)
-
- seq = seq.lower()
-
- strand_map = {'D':'+', 'P':'-'}
-
- strand = strand_map[strand]
-
- if not chr in self.chromo_range:
- continue
-
- unb_seq = unbracket_seq(seq)
-
- # reads mapped to the reverse strand have to be reverse complemented
- if strand == '-':
- unb_seq = reverse_complement(unb_seq)
- seq = reverse_complement(seq)
-
- effective_len = len(unb_seq)
-
- genomicSeq_start = pos
- #genomicSeq_stop = pos+effective_len-1
- genomicSeq_stop = pos+effective_len
-
- start = cpu()
- #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
- currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
-
- # just performed some tests to check LookupTable results
- #assert currentDNASeq_ == currentDNASeq
- #assert currentAcc_ == currentAcc
- #assert currentDon_ == currentDon
-
- stop = cpu()
- self.get_time += stop-start
-
- dna = currentDNASeq
- exons = zeros((2,1))
- exons[0,0] = 0
- exons[1,0] = effective_len
- est = unb_seq
- original_est = seq
- quality = map(lambda x:ord(x)-self.prb_offset,prb)
-
- currentVMatchAlignment = dna, exons, est, original_est, quality,\
- currentAcc, currentDon
-
- #pdb.set_trace()
-
- #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-
- try:
- alternativeAlignmentScores = self.calcAlternativeAlignments(location)
- except:
- alternativeAlignmentScores = []
-
- if alternativeAlignmentScores == []:
- # no alignment necessary
- maxAlternativeAlignmentScore = -inf
- vMatchScore = 0.0
- else:
- maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
- # compute alignment for vmatch unspliced read
- vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
- #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
-
-
- start = cpu()
-
- #print 'all candidates %s' % str(alternativeAlignmentScores)
-
- new_id = id - 1000000300000
-
- unspliced = False
- # unspliced
- if new_id > 0:
- unspliced = True
-
- # Seems that according to our learned parameters VMatch found a good
- # alignment of the current read
- if maxAlternativeAlignmentScore < vMatchScore:
- unspliced_ctr += 1
-
- self.result_unspliced_fh.write(original_line+'\n')
-
- if unspliced:
- self.true_neg += 1
- else:
- self.false_neg += 1
-
- # We found an alternative alignment considering splice sites that scores
- # higher than the VMatch alignment
- else:
- spliced_ctr += 1
-
- self.result_spliced_fh.write(original_line+'\n')
-
- if unspliced:
- self.false_pos += 1
- else:
- self.true_pos += 1
-
- ctr += 1
- stop = cpu()
- self.count_time += stop-start
-
- _stop = cpu()
- self.main_loop = _stop-_start
-
- print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
- print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
- print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
-
-
- def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
-
- def signum(a):
- if a>0:
- return 1
- elif a<0:
- return -1
- else:
- return 0
-
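- # The DNA window passed in was fetched as
- # [pos - max_intron_size, pos + max_intron_size + read_size), so index
- # max_intron_size corresponds to the first base of the read. Proximal
- # sites lie within the read region, distal sites in the flanks.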
- proximal_acc = []
- for idx in xrange(max_intron_size, max_intron_size+read_size/2):
- if currentAcc[idx]>= splice_thresh:
- proximal_acc.append((idx,currentAcc[idx]))
-
- proximal_acc.sort(lambda x,y: signum(x[1]-y[1]))
- proximal_acc=proximal_acc[-2:]
-
- distal_acc = []
- for idx in xrange(max_intron_size+read_size, len(currentAcc)):
- if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
- distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
-
- #distal_acc.sort(lambda x,y: signum(x[1]-y[1]))
- #distal_acc=distal_acc[-2:]
-
- proximal_don = []
- for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
- if currentDon[idx] >= splice_thresh:
- proximal_don.append((idx, currentDon[idx]))
-
- proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
- proximal_don=proximal_don[-2:]
-
- distal_don = []
- for idx in xrange(1, max_intron_size):
- if currentDon[idx] >= splice_thresh and idx>read_size:
- distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
-
- distal_don.sort(lambda x,y: y[0]-x[0])
- #distal_don=distal_don[-2:]
-
- return proximal_acc,proximal_don,distal_acc,distal_don
-
-
- def calcAlternativeAlignments(self,location):
- """
- Given an alignment proposed by vmatch, this function calculates possible
- alternative spliced alignments, taking into account matched
- donor/acceptor positions.
- """
-
- run = self.run
-
- id = location['id']
- chr = location['chr']
- pos = location['pos']
- strand = location['strand']
- original_est = location['seq']
- quality = location['prb']
- quality = map(lambda x:ord(x)-self.prb_offset,quality)
-
- original_est = original_est.lower()
- est = unbracket_seq(original_est)
- effective_len = len(est)
-
- genomicSeq_start = pos - self.max_intron_size
- genomicSeq_stop = pos + self.max_intron_size + len(est)
-
- strand_map = {'D':'+', 'P':'-'}
- strand = strand_map[strand]
-
- if strand == '-':
- est = reverse_complement(est)
- original_est = reverse_complement(original_est)
-
- start = cpu()
- #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
- currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
- stop = cpu()
- self.get_time += stop-start
- dna = currentDNASeq
-
- proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
-
- alternativeScores = []
-
- #pdb.set_trace()
-
- # inlined
- h = self.h
- d = self.d
- a = self.a
- mmatrix = self.mmatrix
- qualityPlifs = self.qualityPlifs
- # inlined
-
- # find an intron on the 3' end
- _start = cpu()
- for (don_pos,don_score) in proximal_don:
- DonorScore = calculatePlif(d, [don_score])[0]
-
- for (acc_pos,acc_score,acc_dna) in distal_acc:
-
- IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
- AcceptorScore = calculatePlif(a, [acc_score])[0]
-
- #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
-
- # construct a new "original_est"
- original_est_cut=''
-
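- # reads carry mismatches in bracket notation: '[de]' stores the dna
- # letter d together with the read letter e, plain letters are matches;
- # here we rewrite the read against the distal acceptor sequence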
- est_ptr=0
- dna_ptr=self.max_intron_size
- ptr=0
- acc_dna_ptr=0
- num_mismatch = 0
-
- while ptr<len(original_est):
- #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
-
- if original_est[ptr]=='[':
- dnaletter=original_est[ptr+1]
- estletter=original_est[ptr+2]
- if dna_ptr < don_pos:
- original_est_cut+=original_est[ptr:ptr+4]
- num_mismatch += 1
- else:
- if acc_dna[acc_dna_ptr]==estletter:
- original_est_cut += estletter # EST letter
- else:
- original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
- num_mismatch += 1
- #print '['+acc_dna[acc_dna_ptr]+estletter+']'
- acc_dna_ptr+=1
- ptr+=4
- else:
- dnaletter=original_est[ptr]
- estletter=dnaletter
-
- if dna_ptr < don_pos:
- original_est_cut+=estletter # EST letter
- else:
- if acc_dna[acc_dna_ptr]==estletter:
- original_est_cut += estletter # EST letter
- else:
- num_mismatch += 1
- original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
- #print '('+acc_dna[acc_dna_ptr]+estletter+')'
- acc_dna_ptr+=1
-
- ptr+=1
-
- if estletter=='-':
- dna_ptr+=1
- elif dnaletter=='-':
- est_ptr+=1
- else:
- dna_ptr+=1
- est_ptr+=1
- if num_mismatch>self.max_mismatch:
- continue
-
- assert(dna_ptr<=len(dna))
- assert(est_ptr<=len(est))
-
- #print original_est, original_est_cut
-
- score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
- score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
-
- alternativeScores.append(score)
-
- if acc_score>=self.splice_stop_thresh:
- break
-
- #pdb.set_trace()
-
- _stop = cpu()
- self.alternativeScoresTime += _stop-_start
-
- # find an intron on the 5' end
- _start = cpu()
- for (acc_pos,acc_score) in proximal_acc:
-
- AcceptorScore = calculatePlif(a, [acc_score])[0]
-
- for (don_pos,don_score,don_dna) in distal_don:
-
- DonorScore = calculatePlif(d, [don_score])[0]
- IntronScore = calculatePlif(h, [acc_pos-don_pos])[0]
-
- #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
-
- # construct a new "original_est"
- original_est_cut=''
-
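- # same bracket-notation rewrite as above, this time against the distal
- # donor sequence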
- est_ptr=0
- dna_ptr=self.max_intron_size
- ptr=0
- num_mismatch = 0
- don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
- while ptr<len(original_est):
-
- if original_est[ptr]=='[':
- dnaletter=original_est[ptr+1]
- estletter=original_est[ptr+2]
- if dna_ptr > acc_pos:
- original_est_cut+=original_est[ptr:ptr+4]
- num_mismatch += 1
- else:
- if don_dna[don_dna_ptr]==estletter:
- original_est_cut += estletter # EST letter
- else:
- original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
- num_mismatch += 1
- #print '['+don_dna[don_dna_ptr]+estletter+']'
- don_dna_ptr+=1
- ptr+=4
- else:
- dnaletter=original_est[ptr]
- estletter=dnaletter
-
- if dna_ptr > acc_pos:
- original_est_cut+=estletter # EST letter
- else:
- if don_dna[don_dna_ptr]==estletter:
- original_est_cut += estletter # EST letter
- else:
- original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
- num_mismatch += 1
- #print '('+don_dna[don_dna_ptr]+estletter+')'
- don_dna_ptr+=1
-
- ptr+=1
-
- if estletter=='-':
- dna_ptr+=1
- elif dnaletter=='-':
- est_ptr+=1
- else:
- dna_ptr+=1
- est_ptr+=1
-
- if num_mismatch>self.max_mismatch:
- continue
- assert(dna_ptr<=len(dna))
- assert(est_ptr<=len(est))
-
- #print original_est, original_est_cut
-
- score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
- score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
-
- alternativeScores.append(score)
-
- if don_score>=self.splice_stop_thresh:
- break
-
- _stop = cpu()
- self.alternativeScoresTime += _stop-_start
-
- return alternativeScores
-
-
- def calcAlignmentScore(self,alignment):
- """
- Given an alignment (dna,exons,est) and the current parameter for QPalma
- this function calculates the dot product of the feature representation of
- the alignment and the parameter vector i.e the alignment score.
- """
-
- start = cpu()
- run = self.run
-
- # Lets start calculation
- dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
-
- score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
-
- stop = cpu()
- self.calcAlignmentScoreTime += stop-start
-
- return score
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-# first step
-
-import qpalma.Configuration as Conf
-
-from pipeline.Pipeline import *
-
-import os.path
-import cPickle
-
-from PipelineHeuristic import *
-
-import glob
-import time
-
-
-def get_most_recent_param(current_dir):
- date_file_list = []
- for file in glob.glob(current_dir + '/param*'):
- stats = os.stat(file)
- lastmod_date = time.localtime(stats[8])
- date_file_tuple = lastmod_date, file
- date_file_list.append(date_file_tuple)
-
- date_file_list.sort()
- date_file_list.reverse()
-
- return date_file_list[0][1]
-
-
-class InitStep(PipelineStep):
-
- def __init__(self):
-
- result_dir = Conf.result_dir
-
-
-class FirstStep(PipelineStep):
- """
- The first step fixes indels in the raw reads and performs the two vmatch
- mapping passes (full transcript and spliced).
- """
-
- def __init__(self):
- jp = os.path.join
-
- # a list storing all commands for this particular step
- cL = []
-
-
- # fetch the config object which stores all relevant parameters
- Config = cPickle.load(open(Conf.conf_object_path))
-
- reads_0_1st = jp(Config['mapping_main_dir'],'reads_0.fl')
- cmd = 'fix_indels.pl %s > %s' % (Config['reads_location'],reads_0_1st)
- cL.append(cmd)
-
- # ATTENTION: Changing working directory
- os.chdir(Config['mapping_main_dir'])
-
- cmd = 'map_full_transcript.pl --mismatches=%d --end_gap=%d -no_edit_distance --read_length=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
- Config['mismatches_1'],\
- Config['end_gap_1'],\
- Config['read_size'],\
- Config['repeat_mapping_1'],\
- Config['seedlength_1'],\
- Config['suffixtree_1'])
-
- cL.append(cmd)
-
- reads_0_2nd = jp(Config['mapping_spliced_dir'],'reads_0.fl')
- cmd = 'add_quality_2_fl.pl reads_0.fl reads_3.fl > %s' % reads_0_2nd
-
- cL.append(cmd)
-
- # ATTENTION: Changing working directory
- os.chdir(Config['mapping_spliced_dir'])
-
- # ATTENTION: Whether we should use the filtering step is not clear
- #cmd = 'filter_quality.pl 8 reads_0.fl && mv reads_0.fl.filtered reads_0.fl'
- #cL.append(cmd)
-
- cmd = 'map_spliced_transcript.pl --mismatches=%d --sub_mismatches=%d --read_length=%d --min_short_end=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
- Config['mismatches_2'],\
- Config['sub_mismatches_2'],\
- Config['read_size'],\
- Config['min_short_end_2'],\
- Config['repeat_mapping_2'],\
- Config['seedlength_2'],\
- Config['suffixtree_2'])
-
- cL.append(cmd)
-
- for elem in cL:
- print elem
-
- PipelineStep.__init__(self,cL)
-
-
-class SecondStep(PipelineStep):
- """
- Placeholder for the training step.
- """
-
- pass
-
-
-class ThirdStep(PipelineStep):
- """
- We use the QPalma heuristic to predict those reads that should be aligned by
- the QPalma alignment program even though vmatch found a full match with some
- mismatches.
- """
- jp = os.path.join
-
- # fetch the config object which stores all relevant parameters
- Config = cPickle.load(open(Conf.conf_object_path))
-
- data_fn = jp(Config['heuristic_dir'],'map.vm')
- param_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
- param_fn = get_most_recent_param(param_dir)
- run_fn = jp(param_dir,'run_obj.pickle')
-
- result_spliced_fn = jp(Config['heuristic_dir'],'spliced.heuristic')
- result_unspliced_fn = jp(Config['heuristic_dir'],'unspliced.heuristic')
-
- ph1 = PipelineHeuristic(run_fn,data_fn,param_fn,result_spliced_fn,result_unspliced_fn)
- ph1.filter()
-
-
-class FourthStep(PipelineStep):
- """
-
- """
-
-
-if __name__ == '__main__':
- step1 = FirstStep()
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-
-class Run(dict):
- """
- This class represents the parameters needed for one single run of an
- algorithm. This means that if we, for example, perform model selection
- over C, then for each value of C we create one Run object storing the
- respective parameters.
- """
-
- def __init__(self):
- pass
-
- def __repr__(self):
-
- result = ""
- for key,val in self.iteritems():
- result += "%s:\t\t %s\n" % (key,str(val))
-
- return result
-
-if __name__ == '__main__':
- r1 = Run()
- r1['attrib_1'] = 12
- r1['attrib_2'] = 22
-
- print "%s" % r1
-
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-#
-# Initialize the pipeline configuration from the QPalmaConfiguration module
-# and perform sanity checks on all parameters.
-#
-
-import os.path
-import cPickle
-
-import QPalmaConfiguration as Conf
-
-
-def check_vmatch_params(Config):
- #
- # Check and set parameters for the first VMatch step
- #
-
- Config['mismatches_1'] = Conf.mismatches_1
- Config['end_gap_1'] = Conf.end_gap_1
- Config['read_length_1'] = Conf.read_length_1
- Config['repeat_mapping_1'] = Conf.repeat_mapping_1
- Config['seedlength_1'] = Conf.seedlength_1
- Config['suffixtree_1'] = Conf.suffixtree_1
-
- assert 0 <= Config['mismatches_1'] <= Config['read_size']
- assert 0 <= Config['end_gap_1'] <= 5
- assert 0 <= Config['repeat_mapping_1'] <= 10
- assert 2 <= Config['seedlength_1'] <= Config['read_size']
- assert os.path.exists( Config['suffixtree_1'] )
-
- #
- # Check and set parameters for the second VMatch step
- #
-
- Config['mismatches_2'] = Conf.mismatches_2
- Config['sub_mismatches_2'] = Conf.sub_mismatches_2
- Config['min_short_end_2'] = Conf.min_short_end_2
- Config['repeat_mapping_2'] = Conf.repeat_mapping_2
- Config['seedlength_2'] = Conf.seedlength_2
- Config['suffixtree_2'] = Conf.suffixtree_2
-
- assert 0 <= Config['mismatches_2'] <= Config['read_size']
- assert 0 <= Config['sub_mismatches_2'] <= Config['read_size']
- assert 0 <= Config['min_short_end_2'] <= Config['read_size']
- assert 0 <= Config['repeat_mapping_2'] <= Config['read_size']
- assert os.path.exists( Config['suffixtree_2'] )
-
-
-def check_and_init():
- """
- The purpose of this script is to take all the global variables from the
- Configuration file and store them into a dictionary for the pipeline.
-
- Additionally, sanity checks are performed to make sure that the parameters
- lie within sensible intervals and that the files they point to exist.
- """
-
- jp = os.path.join
-
- # create a python dictionary to store all configuration parameters
- Config = {}
-
- result_dir = Conf.result_dir
- assert os.path.exists(result_dir), 'Error you have to specify an existing result_dir.'
-
- # assign main result dir in Config dictionary
- Config['result_dir'] = result_dir
-
- Config['reads_location'] = Conf.reads_location
- #assert os.path.exists(Config['reads_location'])
-
- Config['read_size'] = Conf.read_size
- assert 2 <= Config['read_size'] <= 100
-
- # Check VMatch related parameters
- check_vmatch_params(Config)
-
- subdirs = []
-
- # create subdirs needed for mapping / alignment / remapping
- mapping_dir = jp(result_dir,'mapping')
- Config['mapping_dir'] = mapping_dir
- Config['mapping_main_dir'] = jp(mapping_dir,'main')
- Config['mapping_spliced_dir'] = jp(mapping_dir,'spliced')
-
- subdirs.extend([Config['mapping_dir'], Config['mapping_main_dir'],\
- Config['mapping_spliced_dir']])
-
- Config['alignment_dir'] = jp(result_dir,'alignment')
-
- subdirs.extend([Config['alignment_dir']])
-
- Config['remapping_dir'] = jp(result_dir,'remapping')
-
- subdirs.extend([Config['remapping_dir']])
-
- #try:
- # for current_dir in subdirs:
- # os.mkdir(current_dir)
- #except:
- # print 'Error during initialization of project directories'
-
- # store the configuration in a pickled python dictionary
- cPickle.dump(Config,open(Conf.conf_object_path,'w+'))
-
-
-if __name__ == '__main__':
- check_and_init()
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-#
-# The purpose of this script is to check the created dataset pickle files for
-# consistency before doing any kind of training/prediction on the data.
-#
-
-import sys
-import pdb
-import cPickle
-
-from qpalma.sequence_utils import get_seq_and_scores,unbracket_seq,reverse_complement
-
-
-def checkAll(filename):
- """
- This function loads the dataset and performs some sanity checks and
- assertions to be sure that the set is in the right shape for QPalma to
- train or predict on.
- """
-
- dataset = cPickle.load(open(filename))
-
- # we take the first quality vector of the tuple of quality vectors
- quality_index = 0
-
- status = True
- mes = '---'
-
- idx = 0
- for example_key in dataset.keys():
- matches = dataset[example_key]
- print 'Current example %d has %d matches' % (example_key,len(matches))
-
- for example in matches:
- (currentSeqInfo,original_est,currentQualities) = example
-
- (id,chromo,strand,genomicSeq_start,genomicSeq_stop) = currentSeqInfo
-
- assert chromo in range(1,6), pdb.set_trace()
- assert strand in ['+','-'], pdb.set_trace()
-
- quality = currentQualities[quality_index]
-
- # check for key consistency
- assert id == example_key
-
- idx += 1
- if idx % 1000 == 0:
- print 'processed %d examples' % idx
-
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
- genomic_seq_pos,acc_pos,don_pos = get_seq_and_scores(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
- genomic_seq_neg,acc_neg,don_neg = get_seq_and_scores(chromo,'-',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
-
- assert reverse_complement(genomic_seq_neg) == genomic_seq_pos
-
-
- return status,mes
-
-
-if __name__ == '__main__':
- dataset_fn = sys.argv[1]
- status,mes = checkAll(dataset_fn )
-
- if status == True:
- print 'Dataset %s seems to be consistent.' % dataset_fn
- else:
- print 'Dataset %s seems to be inconsistent!' % dataset_fn
- print mes
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import cPickle
-import math
-import numpy
-import os
-import os.path
-import pdb
-import sys
-
-from qpalma.utils import pprint_alignment
-
-import palma.palma_utils as pu
-from palma.output_formating import print_results
-
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
-
-# a little auxiliary function
-pp = lambda x : str(x)[1:-1].replace(' ','')
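-# e.g. pp([1, 2, 3]) -> '1,2,3'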
-
-
-def alignment_reconstruct(current_prediction,num_exons):
- """
- We reconstruct the exact alignment to infer the positions of indels and the
- sizes of the respective exons.
- """
-
- translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
-
- SpliceAlign = current_prediction['spliceAlign']
- estAlign = current_prediction['estAlign']
-
- dna_array = current_prediction['dna_array']
- read_array = current_prediction['read_array']
-
- dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
- read_array = "".join(map(lambda x: translation[int(x)],read_array))
-
- # this array holds a number for each exon indicating its number of matches
- exonIdentities = [0]*num_exons
- exonGaps = [0]*num_exons
-
- gaps = 0
- identity = 0
- idx = 0
-
- for ridx in range(len(read_array)):
- read_elem = read_array[ridx]
- dna_elem = dna_array[ridx]
-
- if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
- idx += 1
- gaps = 0
- identity = 0
-
- if read_elem == '_':
- continue
-
- if read_elem == dna_elem:
- identity += 1
-
- if read_elem == '-':
- gaps += 1
-
- exonIdentities[idx] = identity
- exonGaps[idx] = gaps
-
- return exonIdentities,exonGaps
-
-
-def create_alignment_file(current_loc,out_fname):
-
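- # NB: `settings` used below is assumed to be a module-level configuration
- # dict provided by the importing script; it is not defined in this file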
- out_fh = open(out_fname,'w+')
- allPredictions = cPickle.load(open(current_loc))
- #allPredictions = current_loc
-
- print 'Loaded %d examples' % len(allPredictions)
-
- # fetch the data needed
- accessWrapper = DataAccessWrapper(settings)
- seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
- # Uniqueness filtering using alignment score for reads with several
- # alignments
- allUniquePredictions = {}
-
- for new_prediction in allPredictions:
- id = new_prediction['id']
- if allUniquePredictions.has_key(id):
- current_prediction = allUniquePredictions[id]
-
- current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
- new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
-
- if current_a_score < new_score :
- allUniquePredictions[id] = new_prediction
-
- else:
- allUniquePredictions[id] = new_prediction
-
- written_lines = 0
- for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
-
- id = current_prediction['id']
-
- seq = current_prediction['read']
- dna = current_prediction['dna']
-
- # CHECK !!!
- q1 = 'zzz'
-
- chromo = current_prediction['chr']
- strand = current_prediction['strand']
- start_pos = current_prediction['start_pos']
- alignment = current_prediction['alignment']
-
- DPScores = current_prediction['DPScores']
- predExons = current_prediction['predExons']
- predExons = [e+start_pos for e in predExons]
-
- (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
- tExonSizes,tStarts, tEnds) = alignment
-
- if len(qExonSizes) != num_exons:
- print 'BUG exon sizes %d'%id
- continue
-
- EST2GFF = False
- new_line = ''
-
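- # if EST2GFF is set we emit PSL-like lines (presumably for the psl2gff
- # conversion scripts), otherwise the custom QPalma alignment format that
- # is parsed again by run() below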
- if EST2GFF:
- predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
-
- match = predExons[0,1] - predExons[0,0]
- if predExons.shape == (2,2):
- match += predExons[1,1] - predExons[1,0]
-
- mismatch = 0
- repmatch = 0
- Ns = 0
- Qgapcnt = 0
- Qgapbs = 0
-
- Tgapcnt = 0
- Tgapbs = 0
-
- if predExons.shape == (2,2):
- Tgapbs += predExons[1,0] - predExons[0,1]
- Tgapcnt = 1
-
- # ATTENTION
- #
- # we enforce equal exon sizes for q and t because we are missing indel
- # positions of the alignment
-
- if qExonSizes[0] != tExonSizes[0]:
- continue
-
- Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
- Qsize = len(seq)
- Qstart = qStart
- Qend = qEnd
- Tname = 'CHR%d'%chromo
-
- start_pos -= 2
-
- Tsize = tEnd+1 + start_pos
- Tstart = tStart + start_pos
- Tend = tEnd + start_pos
- blockcnt = Tgapcnt+1
- exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
- Qstarts_str = str(qStarts)[1:-1].replace(' ','')
- Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
-
- new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
- % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
- Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
- Tname, Tsize, Tstart, Tend,\
- blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
-
- else:
- exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
-
-
- new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
- pp(qExonSizes),pp(qStarts),\
- pp(qEnds),pp(tExonSizes),\
- pp(tStarts),pp(tEnds),\
- pp(exonIdentities),pp(exonGaps))
-
- out_fh.write(new_line)
- written_lines += 1
-
- print 'Wrote out %d lines' % written_lines
-
-
-def run(result_fn,outfile):
- """
- Convert the alignment result lines in result_fn into genomic coordinates
- and write them to outfile.
- """
-
- out_fh = open(outfile,'w+')
-
- # fetch the data needed
- accessWrapper = DataAccessWrapper(settings)
- seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
- for line in open(result_fn):
- sl = line.split()
-
- chromo = int(sl[1])
- strand = sl[2]
- start_pos = int(sl[5])-2
- starts = sl[15].split(',')
- ends = sl[16].split(',')
-
- ids = sl[-2].split(',')
- ids = [int(elem) for elem in ids]
- gaps = sl[-1].split(',')
- gaps = [int(elem) for elem in gaps]
-
- if strand == '+':
- pass
- elif strand == '-':
- total_size = seqInfo.chromo_sizes[chromo]
- start_pos = total_size - start_pos
- else:
- assert False
-
- starts = [int(elem) + start_pos for elem in starts]
- ends = [int(elem) + start_pos for elem in ends]
-
- if strand == '+':
- starts = [e+1 for e in starts]
- ends = [e+1 for e in ends]
- else:
- starts = [e-3001 for e in starts]
- ends = [e-3001 for e in ends]
-
- out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
+++ /dev/null
-#!/bin/bash
-
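-#
-# Plot histograms of the coverage (confirmation) counts of correctly and
-# incorrectly aligned reads; expects the coverage label file as $1.
-#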
-cat $1 | grep correct | cut -f 3 > correct.coverage_labels
-cat $1 | grep wrong | cut -f 3 > wrong.coverage_labels
-
-( cat <<EOF
-load correct.coverage_labels;
-load wrong.coverage_labels;
-correct_vec = zeros(10,1);
-wrong_vec = zeros(10,1);
-for i=1:10,
- correct_vec(i) = sum([correct == (i-1)]);
- wrong_vec(i) = sum([wrong == (i-1)]);
-end
-bar(correct_vec,1)
-hold;
-set(gca,'xlim',[0.5 length(correct_vec)+0.5])
-xlabel('Confirmation count')
-ylabel('Frequency')
-set(gca,'XTick',[0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5])
-set(gca,'XTickLabel',[0,1,2,3,4,5,6,7,8,9,10])
-saveas(gcf,'pos_cov.eps')
-clf;
-bar(wrong_vec,1);
-hold;
-set(gca,'xlim',[0.5 length(correct_vec)+0.5])
-xlabel('Confirmation count')
-ylabel('Frequency')
-set(gca,'XTick',[0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5])
-set(gca,'XTickLabel',[0,1,2,3,4,5,6,7,8,9,10])
-saveas(gcf,'neg_cov.eps')
-EOF
-) > test_script.m
-
-matlab -nojvm -r "test_script;" || echo -e "\nmatlab script failed!\n"
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from qpalma.DataProc import *
-from compile_dataset import getSpliceScores, get_seq_and_scores
-
-def debugDataset():
- filename = 'dataset_remapped_test_new'
-
- SeqInfo, Exons, OriginalEsts, Qualities,\
- AlternativeSequences = paths_load_data(filename,None,None,None)
-
- beg = 0
- end = 5000
-
- SeqInfo = SeqInfo[beg:end]
- Exons = Exons[beg:end]
- OriginalEsts= OriginalEsts[beg:end]
- Qualities = Qualities[beg:end]
- AlternativeSequences = AlternativeSequences[beg:end]
-
- for exampleIdx in range(4579,4580):
- currentSeqInfo = SeqInfo[exampleIdx]
- chr,strand,up_cut,down_cut = currentSeqInfo
-
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
-
- currentAlternatives = AlternativeSequences[exampleIdx]
- for alternative_alignment in currentAlternatives:
- chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
- if not chr in range(1,6):
- return
-
- print chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel
- #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-
- pdb.set_trace()
-
-if __name__ == '__main__':
- debugDataset()
-
+++ /dev/null
-#!/bin/bash
-
-g_config=/fml/ag-raetsch/share/databases/genomes/A_thaliana/arabidopsis_tair7/genebuild/genome.config
-
-# First we count the numbers of incorrect gt and gc positions for the full
-# alignment set not filtered by coverage numbers etc.
-
-alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.consistent.unique
-
-touch gt_dpscores.hist && rm gt_dpscores.hist
-touch gc_dpscores.hist && rm gc_dpscores.hist
-
-for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
-do
- /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
- | grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl >> gt_dpscores.hist
- #| grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl |cut -f2 -d '(' | cut -f1 -d ')' >> gt_dpscores.hist
-
- /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
- | grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl >> gc_dpscores.hist
- #| grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl |cut -f2 -d '(' | cut -f1 -d ')' >> gc_dpscores.hist
-done
-
-# Now we determine the numbers of incorrect gt and gc positions for the full
-# alignment set FILTERED by coverage numbers.
-
-alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.coverage_filtered
-
-touch gt_dpscores.hist.coverage_filtered && rm gt_dpscores.hist.coverage_filtered
-touch gc_dpscores.hist.coverage_filtered && rm gc_dpscores.hist.coverage_filtered
-
-for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
-do
- /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
- | grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl >> gt_dpscores.hist.coverage_filtered
- #| grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl |cut -f2 -d '(' | cut -f1 -d ')' >> gt_dpscores.hist.coverage_filtered
-
- /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
- | grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl >> gc_dpscores.hist.coverage_filtered
- #| grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl |cut -f2 -d '(' | cut -f1 -d ')' >> gc_dpscores.hist.coverage_filtered
-done
-
-cat gt_dpscores.hist | sort > gt_dpscores.hist.sorted
-cat gt_dpscores.hist.coverage_filtered | sort > gt_dpscores.hist.coverage_filtered.sorted
-diff --suppress-common-lines gt_dpscores.hist.coverage_filtered.sorted gt_dpscores.hist.sorted > DIFF
+++ /dev/null
-#!/bin/bash
-
-#
-# This script takes the alignment output of QPalma and uses the script psl2gff
-# to cluster the reads together in order to infer gene structures.
-#
-# The result is one or more gff files that are used as input to the
-# compute_splicegraph script of the genebuild project.
-#
-
-g_config=/fml/ag-raetsch/share/databases/genomes/A_thaliana/arabidopsis_tair7/genebuild/genome.config
-
-# First we count the numbers of incorrect gt and gc positions for the full
-# alignment set not filtered by coverage numbers etc.
-
-alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.consistent.unique
-
-for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
-do
- for STRAND in "+" "-"
- do
- echo $CHR $STRAND
- result1_fn=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/SpliceGraphResults/gff/test_myexons_${CHR}${STRAND}.gff
- result2_fn=/dev/null
-
- /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR ${STRAND} $result1_fn $result2_fn $g_config 1>>LOG
- done
-done
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import pdb
-import os
-import os.path
-
-
-def compare_alignments(aln1,aln2):
- return
-
-
-def compare_all_alignments(reads_fn,vmatch_result_dir):
- read_positions = {}
-
- vmatch_correct_ctr = 0
-
- for line in open(reads_fn):
- line = line.strip()
- slist = line.split()
- id = int(slist[0])
- start_pos = int(slist[10])
- end_pos = int(slist[13])
-
- read_positions[id] = (start_pos,end_pos)
-
- for line in open(vmatch_result_dir):
- line = line.strip()
- slist = line.split()
-
- id = int(slist[0])
- vmatch_start_pos = int(slist[2])
-
- start_pos,end_pos = read_positions[id]
-
- if vmatch_start_pos == start_pos:
- vmatch_correct_ctr += 1
-
- print 'vmatch found the correct start position for %d reads' % vmatch_correct_ctr
-
-
-if __name__ == '__main__':
- reads_fn = ''
- vmatch_result_dir = ''
-
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import cPickle
-import sys
-import time
-import pdb
-import os
-import os.path
-import math
-
-from pythongrid import KybJob, Usage
-from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
-
-from createAlignmentFileFromPrediction import create_alignment_file
-
-import grid_alignment
-
-
-def g_alignment(chunk_fn,result_fn):
- create_alignment_file(chunk_fn,result_fn)
-
-
-def create_and_submit():
- jp = os.path.join
-
- run_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
- result_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/alignment'
-
- chunks_fn = []
- for fn in os.listdir(run_dir):
- if fn.startswith('chunk'):
- chunks_fn.append(fn)
-
- #chunks_fn = [\
- #'chunk_9.predictions.pickle',\
- #'chunk_14.predictions.pickle',\
- #'chunk_24.predictions.pickle']
-
- print chunks_fn
-
- functionJobs=[]
-
- for chunk_fn in chunks_fn:
- chunk_name = chunk_fn[:chunk_fn.find('.')]
- result_fn = jp(result_dir,'%s.align_remap'%chunk_name)
- chunk_fn = jp(run_dir,chunk_fn)
-
- #pdb.set_trace()
-
- current_job = KybJob(grid_alignment.g_alignment,[chunk_fn,result_fn])
- current_job.h_vmem = '15.0G'
- current_job.express = 'True'
-
- print "job #1: ", current_job.nativeSpecification
-
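- # each chunk is submitted as its own single-job session; the sleep
- # below throttles the submission rate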
- functionJobs = [current_job]
- #functionJobs.append(current_job)
- (sid, jobids) = submit_jobs(functionJobs)
- time.sleep(10)
-
-
-if __name__ == '__main__':
- create_and_submit()
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import cPickle
-import sys
-import pdb
-import time
-import os
-import os.path
-import math
-
-from pythongrid import KybJob, Usage
-from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
-
-from PipelineHeuristic import *
-
-import grid_heuristic
-
-from Utils import split_file
-
-
-def g_heuristic(run_fname,data_fname,param_fname,result_fname):
- #print run_fname,data_fname,param_fname,result_fname
- ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname)
- ph1.filter()
-
- return 'finished filtering set %s.' % data_fname
-
-
-def create_and_submit():
- jp = os.path.join
-
- num_splits = 50
-
- run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
- #data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'
-
- data_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
-
- run_fname = jp(run_dir,'run_obj.pickle')
-
- original_map_fname = jp(data_dir,'map.vm')
-
- split_file(original_map_fname,data_dir,num_splits)
-
- param_fname = jp(run_dir,'param_526.pickle')
-
- functionJobs=[]
-
- for idx in range(0,num_splits):
- data_fname = jp(data_dir,'map.part_%d'%idx)
- result_fname = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
-
- #pdb.set_trace()
-
- current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
- current_job.h_vmem = '25.0G'
- #current_job.express = 'True'
-
- print "job #1: ", current_job.nativeSpecification
-
- functionJobs.append(current_job)
- #break
-
- (sid, jobids) = submit_jobs(functionJobs)
- #print 'checking whether finished'
- #while not get_status(sid, jobids):
- # time.sleep(7)
- #print 'collecting jobs'
- #retjobs = collect_jobs(sid, jobids, functionJobs)
- #print "ret fields AFTER execution on cluster"
- #for (i, job) in enumerate(retjobs):
- # print "Job #", i, "- ret: ", job.ret
-
- #print '--------------'
-
-
-if __name__ == '__main__':
- #split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
- create_and_submit()
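The commented-out block above sketches how the results of the submitted jobs can be collected. Cleaned up, and using only the pythongrid calls this script already imports, the polling loop could look like the sketch below; that get_status(sid, jobids) returns True once all jobs have finished is an assumption carried over from the commented code:

# Sketch: wait for submitted jobs and collect their return values.
# get_status/collect_jobs come from pythongrid, as imported above.
import time

from pythongrid import get_status, collect_jobs

def wait_and_collect(sid, jobids, functionJobs):
   while not get_status(sid, jobids):
      time.sleep(7)
   retjobs = collect_jobs(sid, jobids, functionJobs)
   for (i, job) in enumerate(retjobs):
      print "Job #", i, "- ret: ", job.ret
   return retjobs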
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import cPickle
-import sys
-import time
-import pdb
-import os
-import os.path
-import math
-
-from pythongrid import KybJob, Usage
-from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
-
-from qpalma_main import *
-
-import grid_predict
-
-
-def get_slices(dataset_size,num_nodes):
-   """
-   Partition the index range [0,dataset_size) into num_nodes contiguous
-   (begin,end) slices; the last slice absorbs the remainder.
-   """
-   all_instances = []
-
- part = dataset_size / num_nodes
- begin = 0
- end = 0
- for idx in range(1,num_nodes+1):
-
- if idx == num_nodes:
- begin = end
- end = dataset_size
- else:
- begin = end
- end = begin+part
-
- params = (begin,end)
-
- all_instances.append(params)
-
- return all_instances
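As a quick sanity check of get_slices (defined above): the last node absorbs the remainder when dataset_size is not divisible by num_nodes, so for 10 keys on 3 nodes one obtains:

# uses get_slices() from this script
assert get_slices(10, 3) == [(0, 3), (3, 6), (6, 10)]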
-
-
-def makeJobs(run,dataset_fn,chunks,param_fn):
-   """
-   Create one KybJob per chunk of prediction keys.
-   """
-
- jobs=[]
-
- # fetch the data needed
- g_dir = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
- don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
- g_fmt = 'chr%d.dna.flat'
- s_fmt = 'contig_%d%s'
-
- num_chromo = 6
-
- accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
- seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
- for c_name,current_chunk in chunks:
- current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,seqInfo,c_name])
- current_job.h_vmem = '20.0G'
- #current_job.express = 'True'
-
- print "job #1: ", current_job.nativeSpecification
-
- jobs.append(current_job)
-
- return jobs
-
-
-def create_and_submit():
-   """
-   Load the run and parameter files, split the prediction keys into
-   chunks and submit one prediction job per chunk.
-   """
-
- jp = os.path.join
-
- run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-
- run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
- run['name'] = 'saved_run'
-
- param_fn = jp(run_dir,'param_526.pickle')
-
- #run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/prediction'
- #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.pickle'
- #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.keys.pickle'
-
- run['result_dir'] = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
- dataset_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.pickle'
- prediction_keys_fn = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.keys.pickle'
-
- prediction_keys = cPickle.load(open(prediction_keys_fn))
-
- print 'Found %d keys for prediction.' % len(prediction_keys)
-
- num_splits = 50
- #num_splits = 1
- slices = get_slices(len(prediction_keys),num_splits)
- chunks = []
-   for idx,(begin,end) in enumerate(slices):
-      c_name = 'chunk_%d' % idx
-      chunks.append((c_name,prediction_keys[begin:end]))
-
- functionJobs = makeJobs(run,dataset_fn,chunks,param_fn)
-
-   # sanity check: total number of keys distributed over the chunks
-   total_keys = sum(len(elem) for name,elem in chunks)
-
-   print 'Got %d job(s) covering %d keys' % (len(functionJobs),total_keys)
-
- #print "output ret field in each job before sending it onto the cluster"
- #for (i, job) in enumerate(functionJobs):
- # print "Job with id: ", i, "- ret: ", job.ret
- #print ""
- #print "sending function jobs to cluster"
- #print ""
-
- (sid, jobids) = submit_jobs(functionJobs)
-
-
-def g_predict(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
-   """
-   Entry point executed on a cluster node: predict one chunk of keys.
-   """
-
- qp = QPalma(False)
- qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name)
- return 'finished prediction of set %s.' % set_name
-
-
-if __name__ == '__main__':
- create_and_submit()
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
-import cPickle
-import sys
-
-filename = sys.argv[1]
-
-predict = cPickle.load(open(filename))
-
-correct = {}
-off = {}
-wrong = {}
-
-split_count = [0]*36
-
-for elem in predict:
- current_elem = elem['realRestPos']
-
- split_count[current_elem] += 1
-
-   if len(elem['predExons']) != 4:
-      wrong[current_elem] = wrong.get(current_elem,0) + 1
-   else:
-      e1b = elem['e1_b_off']
-      e1e = elem['e1_e_off']
-      e2b = elem['e2_b_off']
-      e2e = elem['e2_e_off']
-
-      # all four exon boundary offsets are zero <=> prediction matches exactly
-      if e1b == 0 and e1e == 0 and e2b == 0 and e2e == 0:
-         correct[current_elem] = correct.get(current_elem,0) + 1
-      else:
-         off[current_elem] = off.get(current_elem,0) + 1
-
-import pylab
-
-print split_count
-
-y_values = []
-for overlap in range(36):
-   # fraction of predictions at this overlap that were off or wrong
-   result = off.get(overlap,0) + wrong.get(overlap,0)
-
-   if split_count[overlap] > 0:
-      result = result/(1.0*split_count[overlap])
-   else:
-      result = 0
-
-   y_values.append(result)
-
-outfile = filename.replace('_allPredictions','')
-
-import numpy
-pylab.bar(range(36),y_values,width=0.2,align='center')
-pylab.xticks( numpy.arange(0,36,1) )
-pylab.savefig('%s_error_vs_overlap.eps'%outfile)
-pylab.clf()
-#raw_input()
+++ /dev/null
-#!/bin/bash
-
-instance_ctr=1
-
-for instance in `ls -1 run*.pickle`
-do
- instance_dir=${instance/\.pickle/}
-
- rt="/fml/ag-raetsch/home/fabio/tmp/QPalma"
- newest_param=`ls -t ${rt}/${instance_dir}/param_* | head -n1`
- echo $newest_param
-
- script=predictAll_${instance_ctr}.py
-
- echo "from qpalma_main import QPalma" >> $script
- echo "import sys" >> $script
- echo "from cPickle import *" >> $script
- echo "import io_pickle" >> $script
- echo "dataset = io_pickle.load('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_20k')" >> $script
-
- echo "ma=load(open('${instance}'))" >> $script
- echo "ma.run['dataset_filename'] = dataset" >> $script
-
- #echo "ma.run['training_begin'] = 0" >> $script
- #echo "ma.run['training_end'] = 100" >> $script
-
- echo "ma.run['prediction_begin'] = 0" >> $script
- echo "ma.run['prediction_end'] = 20000" >> $script
-
- echo "ma.run['min_intron_len'] = 20" >> $script
- echo "ma.run['max_intron_len'] = 2000" >> $script
- echo "ma.run['min_svm_score'] = 0.0 " >> $script
- echo "ma.run['max_svm_score'] = 1.0" >> $script
- echo "ma.run['min_qual'] = -5" >> $script
- echo "ma.run['max_qual'] = 40" >> $script
- echo "qpalma1 = QPalma(ma.run)" >> $script
- echo "qpalma1.evaluate('${newest_param}',dataset)" >> $script
-
- script_startup=${script}_startup
-
- echo "source ~/.bashrc" >> $script_startup
- echo "export ILOG_LICENSE_FILE=/fml/ag-raetsch/share/software/ilog/ilm/access2.ilm" >> $script_startup
- echo "python $script" >> $script_startup
-
-
-  # the startup file is a shell script, so it must be run with bash, not python
-  echo "bash $script_startup" | qsub -l h_vmem=3.0G -cwd -j y -N inst_${instance}.log
-
- instance_ctr=$((instance_ctr+1))
-done
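For readability, here is what one of the generated scripts (say predictAll_1.py) looks like after the loop above has run; 'run_1.pickle' and 'param_42.pickle' are placeholders standing in for ${instance} and ${newest_param}:

# Reconstruction of a generated predictAll_<n>.py from the echo lines above.
from qpalma_main import QPalma
import sys
from cPickle import *
import io_pickle

dataset = io_pickle.load('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_20k')

ma = load(open('run_1.pickle'))                # placeholder for ${instance}
ma.run['dataset_filename'] = dataset

ma.run['prediction_begin'] = 0
ma.run['prediction_end'] = 20000
ma.run['min_intron_len'] = 20
ma.run['max_intron_len'] = 2000
ma.run['min_svm_score'] = 0.0
ma.run['max_svm_score'] = 1.0
ma.run['min_qual'] = -5
ma.run['max_qual'] = 40

qpalma1 = QPalma(ma.run)
qpalma1.evaluate('param_42.pickle', dataset)   # placeholder for ${newest_param}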
+++ /dev/null
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import array
-import cPickle
-import os.path
-import pdb
-import sys
-
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_seq_and_scores,unbracket_seq
-
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-# one of the SIQP solver imports must be enabled for training:
-#from qpalma.SIQP_CPX import SIQPSolver
-#from qpalma.SIQP_CVXOPT import SIQPSolver
-
-import QPalmaDP
-import qpalma
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf,compute_donacc
-
-from qpalma.utils import pprint_alignment, get_alignment
-
-class SpliceSiteException(Exception):
-   pass
-
-
-def getData(training_set,exampleKey,run):
- """ This function... """
-
- currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
- id,chr,strand,up_cut,down_cut = currentSeqInfo
-
- est = original_est
- est = "".join(est)
- est = est.lower()
-   est = unbracket_seq(est)
- est = est.replace('-','')
-
- assert len(est) == run['read_size'], pdb.set_trace()
- est_len = len(est)
-
- #original_est = OriginalEsts[exampleIdx]
- original_est = "".join(original_est)
- original_est = original_est.lower()
-
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
-
- #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
-
-   original_exons = currentExons
-   # shift the exon coordinates from genomic positions to positions relative
-   # to the cut-out and make the start positions 0-based for slicing
-   exons = original_exons - (up_cut-1)
-   exons[0,0] -= 1
-   exons[1,0] -= 1
-
- if exons.shape == (2,2):
- fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
-
- donor_elem = dna[exons[0,1]:exons[0,1]+2]
- acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
-
- if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
- print 'invalid donor in example %d'% exampleKey
- raise SpliceSiteException
-
- if not ( acceptor_elem == 'ag' ):
- print 'invalid acceptor in example %d'% exampleKey
- raise SpliceSiteException
-
- assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
-
- return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
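The coordinate arithmetic in getData is easy to misread. The following toy example illustrates the shift from genomic to cut-out-relative coordinates, under the assumption (consistent with the -1 corrections above) that exon coordinates arrive 1-based with inclusive starts while the local dna string is sliced 0-based:

# Toy illustration of the exon coordinate shift in getData().
from numpy.matlib import mat

up_cut = 1000
original_exons = mat([[1000, 1010],
                      [1050, 1060]])

exons = original_exons - (up_cut - 1)   # relative to the cut-out, still 1-based
exons[0, 0] -= 1                        # 0-based start positions, so that
exons[1, 0] -= 1                        # dna[start:end] slices each exon

print exons   # [[ 0 11]
              #  [50 61]]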
-
-
-class QPalma:
- """
- This class wraps the training and prediction functions for
- the alignment.
- """
-
- def __init__(self,run,seqInfo,dmode=False):
- self.ARGS = Param()
- self.qpalma_debug_mode = dmode
- self.run = run
- self.seqInfo = seqInfo
-
-
- def plog(self,string):
- self.logfh.write(string)
- self.logfh.flush()
-
-
- def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
- """
- Given the needed input this method calls the QPalma C module which
- calculates a dynamic programming in order to obtain an alignment
- """
-
- dna_len = len(dna)
- est_len = len(est)
-
- prb = QPalmaDP.createDoubleArrayFromList(quality)
- chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
- matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
- mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
-
- d_len = len(donor)
- donor = QPalmaDP.createDoubleArrayFromList(donor)
- a_len = len(acceptor)
- acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
- # Create the alignment object representing the interface to the C/C++ code.
- currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
- c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-      # calculates SpliceAlign, EstAlign, weightMatch, total scores, dnaest
- currentAlignment.myalign( current_num_path, dna, dna_len,\
- est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
- self.run['print_matrix'] )
-
- if prediction_mode:
- # part that is only needed for prediction
- result_len = currentAlignment.getResultLength()
- dna_array,est_array = currentAlignment.getAlignmentArraysNew()
- else:
- dna_array = None
- est_array = None
-
- newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
- currentAlignment.getAlignmentResultsNew()
-
- return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
- newQualityPlifsFeatures, dna_array, est_array
-
-
- def init_train(self,training_set):
- full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
-
- #assert not os.path.exists(full_working_path)
- if not os.path.exists(full_working_path):
- os.mkdir(full_working_path)
-
- assert os.path.exists(full_working_path)
-
- # ATTENTION: Changing working directory
- os.chdir(full_working_path)
-
- self.logfh = open('_qpalma_train.log','w+')
- cPickle.dump(self.run,open('run_obj.pickle','w+'))
-
- self.plog("Settings are:\n")
- self.plog("%s\n"%str(self.run))
-
- if self.run['mode'] == 'normal':
- self.use_quality_scores = False
-
- elif self.run['mode'] == 'using_quality_scores':
- self.use_quality_scores = True
-      else:
-         assert False, 'unknown mode: %s' % self.run['mode']
-
-
-   def setUpSolver(self,numExamples,lengthSP,donSP,accSP):
-      # Initialize solver
-      self.plog('Initializing problem...\n')
-
-      try:
-         solver = SIQPSolver(self.run['numFeatures'],numExamples,self.run['C'],self.logfh,self.run)
-      except:
-         self.plog('Got no license. Telling queue to reschedule job...\n')
-         sys.exit(99)
-
-      # the Plif penalty values of the donor and acceptor blocks of the
-      # parameter vector must be monotonically increasing
-      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
-      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
-
-      return solver
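Both the monotonicity constraints above and the currentPhi assembly in train() below depend on one fixed layout of the parameter vector. The helper below is not part of the original code; it merely spells out that layout, with the block sizes taken from the run dictionary keys used throughout this file:

# Sketch: (begin,end) offsets of the blocks in the parameter/feature vector.
def param_layout(run):
   lengthSP  = run['numLengthSuppPoints']
   donSP     = run['numDonSuppPoints']
   accSP     = run['numAccSuppPoints']
   mmatrixSP = run['matchmatrixRows'] * run['matchmatrixCols']
   qualSP    = run['totalQualSuppPoints']

   return {
      'intron_length': (0, lengthSP),
      'donor'        : (lengthSP, lengthSP+donSP),
      'acceptor'     : (lengthSP+donSP, lengthSP+donSP+accSP),
      'match_matrix' : (lengthSP+donSP+accSP, lengthSP+donSP+accSP+mmatrixSP),
      'quality'      : (lengthSP+donSP+accSP+mmatrixSP,
                        lengthSP+donSP+accSP+mmatrixSP+qualSP),
   }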
-
-
- def train(self,training_set):
- """
- The mainloop for training.
- """
-
-      # local alias; the body below refers to the bare name `run`
-      run = self.run
-
-      numExamples = len(training_set)
- self.plog('Number of training examples: %d\n'% numExamples)
-
- self.noImprovementCtr = 0
- self.oldObjValue = 1e8
-
- iteration_steps = self.run['iter_steps']
- remove_duplicate_scores = self.run['remove_duplicate_scores']
- print_matrix = self.run['print_matrix']
- anzpath = self.run['anzpath']
-
- # Initialize parameter vector
- param = numpy.matlib.rand(run['numFeatures'],1)
-
- lengthSP = self.run['numLengthSuppPoints']
- donSP = self.run['numDonSuppPoints']
- accSP = self.run['numAccSuppPoints']
- mmatrixSP = self.run['matchmatrixRows']*run['matchmatrixCols']
- numq = self.run['numQualSuppPoints']
- totalQualSP = self.run['totalQualSuppPoints']
-
- # no intron length model
- if not self.run['enable_intron_length']:
- param[:lengthSP] *= 0.0
-
- # Set the parameters such as limits penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
-
-      solver = self.setUpSolver(numExamples,lengthSP,donSP,accSP)
-
- # stores the number of alignments done for each example (best path, second-best path etc.)
- num_path = [anzpath]*numExamples
-
- # stores the gap for each example
- gap = [0.0]*numExamples
-
- currentPhi = zeros((run['numFeatures'],1))
- totalQualityPenalties = zeros((totalQualSP,1))
-
- numConstPerRound = run['numConstraintsPerRound']
- solver_call_ctr = 0
-
- suboptimal_example = 0
- iteration_nr = 0
- param_idx = 0
- const_added_ctr = 0
-
- featureVectors = zeros((run['numFeatures'],numExamples))
-
- self.plog('Starting training...\n')
- # the main training loop
- while True:
- if iteration_nr == iteration_steps:
- break
-
- for exampleIdx,example_key in enumerate(training_set.keys()):
- print 'Current example %d' % example_key
- try:
- dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
- getData(training_set,example_key,run)
- except SpliceSiteException:
- continue
-
- dna_len = len(dna)
-
-         if run['enable_quality_scores']:
-            # take the first quality vector of the tuple of quality vectors
-            quality = currentQualities[0]
-         else:
-            quality = [40]*len(est)
-
- if not run['enable_splice_signals']:
- for idx,elem in enumerate(don_supp):
- if elem != -inf:
- don_supp[idx] = 0.0
-
- for idx,elem in enumerate(acc_supp):
- if elem != -inf:
- acc_supp[idx] = 0.0
-
-         # Compute the parameters of the true alignment (but with untrained d,a,h ...)
-         if run['mode'] == 'using_quality_scores':
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
-            computeSpliceAlignWithQuality(dna, exons, est, original_est,\
-            quality, qualityPlifs,run)
-            dna_calc = dna_calc.replace('-','')
-         else:
-            trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-
- #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
- # Calculate the weights
- trueWeightDon, trueWeightAcc, trueWeightIntron =\
- computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
- trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
- currentPhi[0:lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
- currentPhi[lengthSP:lengthSP+donSP] = mat(d.penalties[:]).reshape(donSP,1)
- currentPhi[lengthSP+donSP:lengthSP+donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
- currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP] = mmatrix[:]
-
- if run['mode'] == 'using_quality_scores':
- totalQualityPenalties = param[-totalQualSP:]
- currentPhi[lengthSP+donSP+accSP+mmatrixSP:] = totalQualityPenalties[:]
-
- # Calculate w'phi(x,y) the total score of the alignment
- trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
- # The allWeights vector is supposed to store the weight parameter
- # of the true alignment as well as the weight parameters of the
- # num_path[exampleIdx] other alignments
- allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
- allWeights[:,0] = trueWeight[:,0]
-
- AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
- AlignmentScores[0] = trueAlignmentScore
-
- ################## Calculate wrong alignment(s) ######################
- # Compute donor, acceptor with penalty_lookup_new
- # returns two double lists
- donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
- ps = h.convert2SWIG()
-
- newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
- newQualityPlifsFeatures, unneeded1, unneeded2 =\
- self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
- mm_len = run['matchmatrixRows']*run['matchmatrixCols']
-
- newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
- newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
-
- newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
-         # Calculate the weights of the respective alignments. Note that we
-         # compute the n-best alignments without Hamming loss, so we have to
-         # keep track of which of the n-best alignments corresponds to the
-         # true one, in order not to incorporate the true alignment into the
-         # constraints. To that end the array true_map holds, for each
-         # decoded alignment, a boolean flag indicating whether it is
-         # equivalent to the true alignment.
- true_map = [0]*(num_path[exampleIdx]+1)
- true_map[0] = 1
-
- for pathNr in range(num_path[exampleIdx]):
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
- h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
- acc_supp)
-
-               decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
-               # store the weights of this path in the remaining columns of allWeights
-               allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
-
- hpen = mat(h.penalties).reshape(len(h.penalties),1)
- dpen = mat(d.penalties).reshape(len(d.penalties),1)
- apen = mat(a.penalties).reshape(len(a.penalties),1)
- features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
-
- featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
-
- AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
- distinct_scores = False
- if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
- distinct_scores = True
-
-               # Check whether scalar product + loss equals the Viterbi score
- if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
- self.plog("Scalar prod. + loss not equals Viterbi output!\n")
- pdb.set_trace()
-
- self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
- self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
-
- # if the pathNr-best alignment is very close to the true alignment consider it as true
- if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
- true_map[pathNr+1] = 1
-
- if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
- print "suboptimal_example %d\n" %exampleIdx
- #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
- #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
-
- #pdb.set_trace()
- suboptimal_example += 1
- self.plog("suboptimal_example %d\n" %exampleIdx)
-
-            # If every decoded path is considered equivalent to the true
-            # alignment, the n-best paths are too close to each other; extend
-            # the n-best search to an (n+1)-best search for this example.
- if len([elem for elem in true_map if elem == 1]) == len(true_map):
- num_path[exampleIdx] = num_path[exampleIdx]+1
-
- # Choose true and first false alignment for extending
- firstFalseIdx = -1
- for map_idx,elem in enumerate(true_map):
- if elem == 0:
- firstFalseIdx = map_idx
- break
-
-            # disabled debug output; note that currentAlignment is local to
-            # do_alignment, so this block cannot be enabled as-is
-            if False:
-               self.plog("Is considered as: %d\n" % true_map[1])
-
- #result_len = currentAlignment.getResultLength()
-
- dna_array,est_array = currentAlignment.getAlignmentArraysNew()
-
- _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
- _newEstAlign = newEstAlign[0].flatten().tolist()[0]
-
- # if there is at least one useful false alignment add the
- # corresponding constraints to the optimization problem
- if firstFalseIdx != -1:
- firstFalseWeights = allWeights[:,firstFalseIdx]
- differenceVector = trueWeight - firstFalseWeights
- #pdb.set_trace()
-
- const_added = solver.addConstraint(differenceVector, exampleIdx)
- const_added_ctr += 1
-
- # end of one example processing
-
- # call solver every nth example //added constraint
- if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
- objValue,w,self.slacks = solver.solve()
- solver_call_ctr += 1
-
- if solver_call_ctr == 5:
- numConstPerRound = 200
- self.plog('numConstPerRound is now %d\n'% numConstPerRound)
-
- if math.fabs(objValue - self.oldObjValue) <= 1e-6:
- self.noImprovementCtr += 1
-
- if self.noImprovementCtr == numExamples+1:
- break
-
- self.oldObjValue = objValue
- print "objValue is %f" % objValue
-
-               sum_xis = sum(self.slacks)
-
- print 'sum of slacks is %f'% sum_xis
- self.plog('sum of slacks is %f\n'% sum_xis)
-
- for i in range(len(param)):
- param[i] = w[i]
-
- cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
- param_idx += 1
- [h,d,a,mmatrix,qualityPlifs] =\
- set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
-
- ##############################################
- # end of one iteration through all examples #
- ##############################################
-
- self.plog("suboptimal rounds %d\n" %suboptimal_example)
-
-         if self.noImprovementCtr == numExamples*2:
-            self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
-            return
-
- iteration_nr += 1
-
- #
- # end of optimization
- #
-      self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
-
-
-   def FinalizeTraining(self,vector,name):
-      self.plog("Training completed\n")
-      cPickle.dump(vector,open(name,'w+'))
-      self.logfh.close()
-
-
-###############################################################################
-#
-# End of the code needed for training
-#
-# Beginning of the code for prediction
-#
-###############################################################################
-
-
- def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
- """
- Performing a prediction takes...
- """
- self.set_name = set_name
-
- #full_working_path = os.path.join(run['alignment_dir'],run['name'])
- full_working_path = self.run['result_dir']
-
- print 'full_working_path is %s' % full_working_path
-
- #assert not os.path.exists(full_working_path)
- if not os.path.exists(full_working_path):
- os.mkdir(full_working_path)
-
- assert os.path.exists(full_working_path)
-
- # ATTENTION: Changing working directory
- os.chdir(full_working_path)
-
- self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
-
- # number of prediction instances
- self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
-
- # load dataset and fetch instances that shall be predicted
- dataset = cPickle.load(open(dataset_fn))
-
- prediction_set = {}
- for key in prediction_keys:
- prediction_set[key] = dataset[key]
-
- # we do not need the full dataset anymore
- del dataset
-
- # Load parameter vector to predict with
- param = cPickle.load(open(param_fn))
-
- self.predict(prediction_set,param)
-
-
- def predict(self,prediction_set,param):
- """
- This method...
- """
-
- if self.run['mode'] == 'normal':
- self.use_quality_scores = False
-
- elif self.run['mode'] == 'using_quality_scores':
- self.use_quality_scores = True
-         else:
-            assert False, 'unknown mode: %s' % self.run['mode']
-
- # Set the parameters such as limits/penalties for the Plifs
- [h,d,a,mmatrix,qualityPlifs] =\
- set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
-
- if not self.qpalma_debug_mode:
- self.plog('Starting prediction...\n')
-
- self.problem_ctr = 0
-
- # where we store the predictions
- allPredictions = []
-
- # we take the first quality vector of the tuple of quality vectors
- quality_index = 0
-
- # beginning of the prediction loop
- for example_key in prediction_set.keys():
- print 'Current example %d' % example_key
- for example in prediction_set[example_key]:
-
- currentSeqInfo,read,currentQualities = example
-
- id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
- currentSeqInfo
-
- if not self.qpalma_debug_mode:
- self.plog('Loading example id: %d...\n'% int(id))
-
- if self.run['enable_quality_scores']:
- quality = currentQualities[quality_index]
- else:
- quality = [40]*len(read)
-
-            try:
-               currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
-            except:
-               # sequence or splice score lookup failed for this region; skip the read
-               self.problem_ctr += 1
-               continue
-
- if not self.run['enable_splice_signals']:
- for idx,elem in enumerate(currentDon):
- if elem != -inf:
- currentDon[idx] = 0.0
-
- for idx,elem in enumerate(currentAcc):
- if elem != -inf:
- currentAcc[idx] = 0.0
-
- current_prediction = self.calc_alignment(currentDNASeq, read,\
- quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
-
- current_prediction['id'] = id
- current_prediction['chr'] = chromo
- current_prediction['strand'] = strand
- current_prediction['start_pos'] = genomicSeq_start
-
- allPredictions.append(current_prediction)
-
- if not self.qpalma_debug_mode:
- self.FinalizePrediction(allPredictions)
- else:
- return allPredictions
-
-
- def FinalizePrediction(self,allPredictions):
- """ End of the prediction loop we save all predictions in a pickle file and exit """
-
- cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
- self.plog('Prediction completed\n')
- mes = 'Problem ctr %d' % self.problem_ctr
- self.plog(mes+'\n')
- self.logfh.close()
-
-
- def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
- """
- Given two sequences and the parameters we calculate on alignment
- """
-
- run = self.run
- donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-      if '-' in read:
-         self.plog('found gap\n')
-         read = read.replace('-','')
-         assert len(read) == run['read_size']
-
- ps = h.convert2SWIG()
-
- newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
- newQualityPlifsFeatures, dna_array, read_array =\
- self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
-
- mm_len = run['matchmatrixRows']*run['matchmatrixCols']
-
- true_map = [0]*2
- true_map[0] = 1
- pathNr = 0
-
- _newSpliceAlign = array.array('B',newSpliceAlign)
- _newEstAlign = array.array('B',newEstAlign)
-
- #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
- alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array)
-
- dna_array = array.array('B',dna_array)
- read_array = array.array('B',read_array)
-
- newExons = self.calculatePredictedExons(newSpliceAlign)
-
- current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
- 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
- 'dna_array':dna_array, 'read_array':read_array }
-
- return current_prediction
-
-
- def calculatePredictedExons(self,SpliceAlign):
- newExons = []
- oldElem = -1
-      SpliceAlign.append(-1) # sentinel so that a terminal exon is closed
- for pos,elem in enumerate(SpliceAlign):
- if pos == 0:
- oldElem = -1
- else:
- oldElem = SpliceAlign[pos-1]
-
- if oldElem != 0 and elem == 0: # start of exon
- newExons.append(pos)
-
- if oldElem == 0 and elem != 0: # end of exon
- newExons.append(pos)
-
- return newExons
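To make the exon decoding concrete, here is a standalone re-statement of calculatePredictedExons together with a worked example; it assumes, as the comments above do, that the value 0 in SpliceAlign marks exonic positions:

# Standalone re-statement of calculatePredictedExons (does not mutate its input).
def predicted_exons(splice_align):
   exons = []
   splice_align = splice_align + [-1]   # sentinel closes a terminal exon
   for pos, elem in enumerate(splice_align):
      if pos == 0:
         old = -1
      else:
         old = splice_align[pos-1]
      if old != 0 and elem == 0:        # transition into an exon
         exons.append(pos)
      if old == 0 and elem != 0:        # transition out of an exon
         exons.append(pos)
   return exons

# the two predicted exons are the half-open intervals [0,3) and [5,7):
assert predicted_exons([0, 0, 0, 2, 2, 0, 0]) == [0, 3, 5, 7]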
+++ /dev/null
-#!/bin/bash
-
-instance=$1
-
-source $HOME/.bashrc
-
-#python -c "import sys; from cPickle import *; ma=load(open('lmm_${instance}.pickle'))
-#
-#try:
-# ma.run()
-#except:
-# sys.exit(99)"
-
-python -c "import sys; from cPickle import *; ma=load(open('${instance}')); ma.train()"