+ cleaned up directories
author    Fabio <fabio@congo.fml.local>
          Tue, 7 Oct 2008 09:55:20 +0000 (11:55 +0200)
committer Fabio <fabio@congo.fml.local>
          Tue, 7 Oct 2008 09:55:20 +0000 (11:55 +0200)
+ moved some files to the qpalma package

26 files changed:
doc/qpalma.tex
qpalma/PipelineHeuristic.py [new file with mode: 0644]
qpalma/Run.py [new file with mode: 0644]
qpalma/qpalma_main.py [new file with mode: 0644]
scripts/Evaluation.py [deleted file]
scripts/Experiment.py [deleted file]
scripts/ModelSelection.py [deleted file]
scripts/PipelineHeuristic.py [deleted file]
scripts/QPalmaPipeline.py [deleted file]
scripts/Run.py [deleted file]
scripts/__init__.py [deleted file]
scripts/check_and_init.py [deleted file]
scripts/check_dataset.py [deleted file]
scripts/createAlignmentFileFromPrediction.py [deleted file]
scripts/createCoverageFigures.sh [deleted file]
scripts/debugDataset.py [deleted file]
scripts/est2gff.sh [deleted file]
scripts/est2gff_pipeline.sh [deleted file]
scripts/evaluateVmatch.py [deleted file]
scripts/grid_alignment.py [deleted file]
scripts/grid_heuristic.py [deleted file]
scripts/grid_predict.py [deleted file]
scripts/plot_error.py [deleted file]
scripts/predictAll.sh [deleted file]
scripts/qpalma_main.py [deleted file]
scripts/resurrect [deleted file]

diff --git a/doc/qpalma.tex b/doc/qpalma.tex
index d264a06..5e1532e 100644 (file)
@@ -25,8 +25,7 @@ alignments that uses the base quality information of the reads directly in the
 alignment step. Optimal alignment parameters i.e. scoring matrices are inferred
 using a machine learning technique similar to \emph{Support Vector Machines}.
 For further details on \QP itself consult the paper \cite{DeBona08}.  For
-details about the learning method \cite{Tsochantaridis04}.
-%$SOLid$. 
+details about the learning method see \cite{Tsochantaridis04}.
 %We refer to the whole pipeline as the \QP pipeline and \QP respectively.
 
 \section{Quicktour}
@@ -222,7 +221,6 @@ Dependencies so far
 - pythongrid
 - Genefinding doIntervalQuery
 
-
 \begin{thebibliography}{1}
 
 \bibitem[1]{DeBona08} 
diff --git a/qpalma/PipelineHeuristic.py b/qpalma/PipelineHeuristic.py
new file mode 100644 (file)
index 0000000..3fd1418
--- /dev/null
@@ -0,0 +1,618 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Gunnar Rätsch, Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+import cPickle
+import sys
+import pdb
+import os
+import os.path
+import resource
+
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+
+from numpy.matlib import mat,zeros,ones,inf
+from numpy import inf,mean
+
+from ParaParser import ParaParser,IN_VECTOR
+
+from qpalma.Lookup import LookupTable
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
+
+
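+# combined user and system CPU time of the current process; used for the
+# timing statistics collected below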
+def cpu():
+   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
+   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
+
+
+class BracketWrapper:
+   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
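+   # one parsed entry holds: read id, chromosome, position, strand, the
+   # (bracketed) read sequence and the quality string; cf. the format
+   # string '%lu%d%d%s%s%s' below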
+
+   def __init__(self,filename):
+      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+      self.parser.parseFile(filename)
+
+   def __len__(self):
+      return self.parser.getSize(IN_VECTOR)
+      
+   def __getitem__(self,key):
+      return self.parser.fetchEntry(key)
+
+   def __iter__(self):
+      self.counter = 0
+      self.size = self.parser.getSize(IN_VECTOR)
+      return self
+
+   def next(self):
+      if not self.counter < self.size:
+         raise StopIteration
+      return_val = self.parser.fetchEntry(self.counter)
+      self.counter += 1
+      return return_val
+
+
+class PipelineHeuristic:
+   """
+   This class wraps the filter which decides whether an alignment found by
+   vmatch is spliced and should therefore be realigned using QPalma.
+   """
+
+   def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
+      """
+      We need a run object holding information about the number of support
+      points etc.
+      """
+
+      run = cPickle.load(open(run_fname))
+      self.run = run
+
+      start = cpu()
+      self.all_remapped_reads = BracketWrapper(data_fname)
+      stop = cpu()
+
+      print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
+
+      self.chromo_range = settings['allowed_fragments']
+
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+      start = cpu()
+      self.lt1 = LookupTable(settings)
+      stop = cpu()
+
+      print 'prefetched sequence and splice data in %f sec' % (stop-start)
+
+      self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+      self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
+
+      start = cpu()
+
+      self.data_fname = data_fname
+
+      self.param = cPickle.load(open(param_fname))
+      
+      # Set the parameters such as limits/penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
+
+      self.h = h
+      self.d = d
+      self.a = a
+      self.mmatrix = mmatrix
+      self.qualityPlifs = qualityPlifs
+
+      self.read_size    = 38
+      self.prb_offset   = 64
+      #self.prb_offset   = 50
+
+      # parameters of the heuristics to decide whether the read is spliced
+      #self.splice_thresh      = 0.005
+      self.splice_thresh      = 0.01
+      self.max_intron_size    = 2000
+      self.max_mismatch       = 2 
+      #self.splice_stop_thresh = 0.99
+      self.splice_stop_thresh = 0.8
+      self.spliced_bias       = 0.0
+
+      param_lines = [\
+      ('%f','splice_thresh',self.splice_thresh),\
+      ('%d','max_intron_size',self.max_intron_size),\
+      ('%d','max_mismatch',self.max_mismatch),\
+      ('%f','splice_stop_thresh',self.splice_stop_thresh),\
+      ('%f','spliced_bias',self.spliced_bias)]
+
+      param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
+      param_lines.append('# data: %s\n'%self.data_fname)
+      param_lines.append('# param: %s\n'%param_fname)
+
+      #pdb.set_trace()
+
+      for p_line in param_lines:
+         self.result_spliced_fh.write(p_line)
+         self.result_unspliced_fh.write(p_line)
+   
+      #self.original_reads = {}
+
+      # we do not have this information
+      #for line in open(reads_pipeline_fn):
+      #   line = line.strip()
+      #   id,seq,q1,q2,q3 = line.split()
+      #   id = int(id)
+      #   self.original_reads[id] = seq
+
+      lengthSP    = run['numLengthSuppPoints']
+      donSP       = run['numDonSuppPoints']
+      accSP       = run['numAccSuppPoints']
+      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+      numq        = run['numQualSuppPoints']
+      totalQualSP = run['totalQualSuppPoints']
+
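+      # assemble the current parameter vector phi; its assumed layout
+      # (matching set_param_palma) is:
+      # [ intron length plif | donor plif | acceptor plif | match matrix | quality plifs ]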
+      currentPhi = zeros((run['numFeatures'],1))
+      currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
+      currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
+      currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
+      currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
+
+      totalQualityPenalties = self.param[-totalQualSP:]
+      currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
+      self.currentPhi = currentPhi
+
+      # we want to identify spliced reads 
+      # so true pos are spliced reads that are predicted "spliced"
+      self.true_pos  = 0
+      
+      # as false positives we count all reads that are not spliced but predicted
+      # as "spliced"
+      self.false_pos = 0
+
+      self.true_neg  = 0
+      self.false_neg = 0
+
+      # total time spent fetching sequences and scores
+      self.get_time  = 0.0
+      self.calcAlignmentScoreTime = 0.0
+      self.alternativeScoresTime = 0.0
+
+      self.count_time = 0.0
+      self.main_loop = 0.0
+      self.splice_site_time = 0.0
+      self.computeSpliceAlignWithQualityTime = 0.0
+      self.computeSpliceWeightsTime = 0.0
+      self.DotProdTime = 0.0
+      self.array_stuff = 0.0
+      stop = cpu()
+
+      self.init_time = stop-start
+
+   def filter(self):
+      """
+      This method...
+      """
+      run = self.run 
+
+      ctr = 0
+      unspliced_ctr  = 0
+      spliced_ctr    = 0
+
+      print 'Starting filtering...'
+      _start = cpu()
+
+      for location,original_line in self.all_remapped_reads:
+         
+         if ctr % 1000 == 0:
+            print ctr
+
+         id       = location['id']
+         chr      = location['chr']
+         pos      = location['pos']
+         strand   = location['strand']
+         seq      = location['seq']
+         prb      = location['prb']
+
+         id = int(id)
+
+         seq = seq.lower()
+
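+         # map the vmatch strand labels ('D' = direct, 'P' = palindromic,
+         # i.e. reverse strand) to the usual '+'/'-' notation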
+         strand_map = {'D':'+', 'P':'-'}
+
+         strand = strand_map[strand]
+
+         if not chr in self.chromo_range:
+            continue
+
+         unb_seq = unbracket_seq(seq)
+
+         # reads mapped to the reverse strand have to be reverse-complemented
+         if strand == '-':
+            unb_seq = reverse_complement(unb_seq)
+            seq = reverse_complement(seq)
+
+         effective_len = len(unb_seq)
+
+         genomicSeq_start  = pos
+         #genomicSeq_stop   = pos+effective_len-1
+         genomicSeq_stop   = pos+effective_len
+
+         start = cpu()
+         #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
+         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
+
+         # just performed some tests to check LookupTable results
+         #assert currentDNASeq_ == currentDNASeq
+         #assert currentAcc_ == currentAcc
+         #assert currentDon_ == currentDon
+
+         stop = cpu()
+         self.get_time += stop-start
+
+         dna            = currentDNASeq
+         exons          = zeros((2,1))
+         exons[0,0]     = 0
+         exons[1,0]     = effective_len
+         est            = unb_seq
+         original_est   = seq
+         quality        = map(lambda x:ord(x)-self.prb_offset,prb)
+
+         currentVMatchAlignment = dna, exons, est, original_est, quality,\
+         currentAcc, currentDon
+
+         #pdb.set_trace()
+
+         #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+
+         try:
+            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
+         except:
+            alternativeAlignmentScores = []
+
+         if alternativeAlignmentScores == []:
+             # no alignment necessary
+             maxAlternativeAlignmentScore = -inf
+             vMatchScore = 0.0 
+         else:
+             maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
+             # compute alignment for vmatch unspliced read
+             vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
+             #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
+             
+
+         start = cpu()
+         
+         #print 'all candidates %s' % str(alternativeAlignmentScores)
+
+         new_id = id - 1000000300000
+
+         unspliced = False
+         # unspliced 
+         if new_id > 0: 
+            unspliced = True
+
+         # According to the learned parameters vmatch already found a good
+         # alignment of the current read
+         if maxAlternativeAlignmentScore < vMatchScore:
+            unspliced_ctr += 1
+
+            self.result_unspliced_fh.write(original_line+'\n') 
+
+            if unspliced:
+               self.true_neg += 1
+            else:
+               self.false_neg += 1
+
+         # We found an alternative alignment considering splice sites that scores
+         # higher than the VMatch alignment
+         else:
+            spliced_ctr += 1
+
+            self.result_spliced_fh.write(original_line+'\n')
+
+            if unspliced:
+               self.false_pos += 1
+            else:
+               self.true_pos += 1
+
+         ctr += 1
+         stop = cpu()
+         self.count_time = stop-start
+
+      _stop = cpu()
+      self.main_loop = _stop-_start
+
+      print 'Unspliced/Spliced: %d %d'%(unspliced_ctr,spliced_ctr)
+      print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
+      print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
+
+
+   def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
+
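+      # the DNA window handed in by calcAlternativeAlignments starts
+      # max_intron_size bases upstream of the vmatch hit, so the read itself
+      # covers indices [max_intron_size, max_intron_size+read_size); proximal
+      # sites are searched within the read, distal sites in the flanking
+      # regions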
+      def signum(a):
+          if a > 0:
+              return 1
+          elif a < 0:
+              return -1
+          else:
+              return 0
+
+      proximal_acc   = []
+      for idx in xrange(max_intron_size, max_intron_size+read_size/2):
+          if currentAcc[idx]>= splice_thresh:
+            proximal_acc.append((idx,currentAcc[idx]))
+
+      proximal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      proximal_acc=proximal_acc[-2:]
+
+      distal_acc   = []
+      for idx in xrange(max_intron_size+read_size, len(currentAcc)):
+          if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
+            distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
+
+      #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
+      #distal_acc=distal_acc[-2:]
+
+      proximal_don   = []
+      for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
+         if currentDon[idx] >= splice_thresh:
+            proximal_don.append((idx, currentDon[idx]))
+
+      proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
+      proximal_don=proximal_don[-2:]
+
+      distal_don   = []
+      for idx in xrange(1, max_intron_size):
+         if currentDon[idx] >= splice_thresh and idx>read_size:
+            distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
+
+      distal_don.sort(lambda x,y: y[0]-x[0])
+      #distal_don=distal_don[-2:]
+
+      return proximal_acc,proximal_don,distal_acc,distal_don
+
+
+   def calcAlternativeAlignments(self,location):
+      """
+      Given an alignment proposed by vmatch this function calculates possible
+      alternative spliced alignments, taking predicted donor/acceptor splice
+      site positions into account.
+      """
+
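+      # strategy: fetch a DNA window extending max_intron_size bases to both
+      # sides of the vmatch hit, pick high-scoring proximal/distal splice
+      # sites in it and score the resulting spliced alignment candidates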
+      run = self.run
+
+      id       = location['id']
+      chr      = location['chr']
+      pos      = location['pos']
+      strand   = location['strand']
+      original_est = location['seq']
+      quality      = location['prb']
+      quality        = map(lambda x:ord(x)-self.prb_offset,quality)
+      
+      original_est = original_est.lower()
+      est = unbracket_seq(original_est)
+      effective_len = len(est)
+
+      genomicSeq_start  = pos - self.max_intron_size
+      genomicSeq_stop   = pos + self.max_intron_size + len(est)
+
+      strand_map = {'D':'+', 'P':'-'}
+      strand = strand_map[strand]
+
+      if strand == '-':
+         est = reverse_complement(est)
+         original_est = reverse_complement(original_est)
+
+      start = cpu()
+      #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
+      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
+      stop = cpu()
+      self.get_time += stop-start
+      dna   = currentDNASeq
+
+      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
+       
+      alternativeScores = []
+
+      #pdb.set_trace()
+      
+      # inlined
+      h = self.h
+      d = self.d
+      a = self.a
+      mmatrix = self.mmatrix
+      qualityPlifs = self.qualityPlifs
+      # inlined
+
+      # find an intron on the 3' end
+      _start = cpu()
+      for (don_pos,don_score) in proximal_don:
+          DonorScore = calculatePlif(d, [don_score])[0]
+
+          for (acc_pos,acc_score,acc_dna) in distal_acc:
+
+              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
+              AcceptorScore = calculatePlif(a, [acc_score])[0] 
+          
+              #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
+
+              # construct a new "original_est"
+              original_est_cut=''
+              
+              est_ptr=0
+              dna_ptr=self.max_intron_size
+              ptr=0
+              acc_dna_ptr=0
+              num_mismatch = 0
+              
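+              # the read is given in "bracket" notation: a mismatch is the
+              # 4-character group [XY] with DNA letter X and read letter Y,
+              # matching positions are plain letters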
+              while ptr<len(original_est):
+                  #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
+                  
+                  if original_est[ptr]=='[':
+                      dnaletter=original_est[ptr+1]
+                      estletter=original_est[ptr+2]
+                      if dna_ptr < don_pos:
+                          original_est_cut+=original_est[ptr:ptr+4]
+                          num_mismatch += 1
+                      else:
+                          if acc_dna[acc_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '['+acc_dna[acc_dna_ptr]+estletter+']' 
+                          acc_dna_ptr+=1 
+                      ptr+=4 
+                  else:
+                      dnaletter=original_est[ptr]
+                      estletter=dnaletter
+                
+                      if dna_ptr < don_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if acc_dna[acc_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              num_mismatch += 1
+                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
+                              #print '('+acc_dna[acc_dna_ptr]+estletter+')' 
+                          acc_dna_ptr+=1 
+                          
+                      ptr+=1
+
+                  if estletter=='-':
+                      dna_ptr+=1 
+                  elif dnaletter=='-':
+                      est_ptr+=1
+                  else:
+                      dna_ptr+=1 
+                      est_ptr+=1
+              if num_mismatch>self.max_mismatch:
+                  continue
+                         
+              assert(dna_ptr<=len(dna))
+              assert(est_ptr<=len(est))
+
+              #print original_est, original_est_cut              
+
+              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias 
+         
+              alternativeScores.append(score)
+
+              if acc_score>=self.splice_stop_thresh:
+                  break
+              
+      #pdb.set_trace()
+
+      _stop = cpu()
+      self.alternativeScoresTime += _stop-_start
+
+      # find an intron on the 5' end 
+      _start = cpu()
+      for (acc_pos,acc_score) in proximal_acc:
+
+          AcceptorScore = calculatePlif(a, [acc_score])[0]
+          
+          for (don_pos,don_score,don_dna) in distal_don:
+
+              DonorScore = calculatePlif(d, [don_score])[0]
+              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
+          
+              #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
+
+              # construct a new "original_est"
+              original_est_cut=''
+              
+              est_ptr=0
+              dna_ptr=self.max_intron_size
+              ptr=0
+              num_mismatch = 0
+              don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
+              while ptr<len(original_est):
+                  
+                  if original_est[ptr]=='[':
+                      dnaletter=original_est[ptr+1]
+                      estletter=original_est[ptr+2]
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=original_est[ptr:ptr+4] 
+                          num_mismatch += 1
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '['+don_dna[don_dna_ptr]+estletter+']' 
+                          don_dna_ptr+=1 
+                      ptr+=4 
+                  else:
+                      dnaletter=original_est[ptr]
+                      estletter=dnaletter
+                
+                      if dna_ptr > acc_pos:
+                          original_est_cut+=estletter # EST letter
+                      else:
+                          if don_dna[don_dna_ptr]==estletter:
+                              original_est_cut += estletter # EST letter
+                          else:
+                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
+                              num_mismatch += 1
+                              #print '('+don_dna[don_dna_ptr]+estletter+')' 
+                          don_dna_ptr+=1 
+                          
+                      ptr+=1
+
+                  if estletter=='-':
+                      dna_ptr+=1 
+                  elif dnaletter=='-':
+                      est_ptr+=1
+                  else:
+                      dna_ptr+=1 
+                      est_ptr+=1
+                         
+              if num_mismatch>self.max_mismatch:
+                  continue
+              assert(dna_ptr<=len(dna))
+              assert(est_ptr<=len(est))
+
+              #print original_est, original_est_cut              
+
+              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
+              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
+         
+              alternativeScores.append(score)
+
+              if don_score>=self.splice_stop_thresh:
+                  break
+              
+      _stop = cpu()
+      self.alternativeScoresTime += _stop-_start
+
+      return alternativeScores
+
+
+   def calcAlignmentScore(self,alignment):
+      """
+      Given an alignment (dna,exons,est) and the current parameters of QPalma
+      this function calculates the dot product of the feature representation of
+      the alignment and the parameter vector, i.e. the alignment score.
+      """
+
+      start = cpu()
+      run = self.run
+
+      # unpack the alignment tuple
+      dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
+
+      score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
+
+      stop = cpu()
+      self.calcAlignmentScoreTime += stop-start
+      
+      return score 
diff --git a/qpalma/Run.py b/qpalma/Run.py
new file mode 100644 (file)
index 0000000..a9755bd
--- /dev/null
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+
+class Run(dict):
+   """
+   This class represents the parameters needed for one single run of an
+   algorithm. If we, for example, perform a model selection over C, then for
+   each value of C we have to create one Run object storing the respective
+   parameters.
+   """
+
+   def __init__(self):
+      pass
+
+   def __repr__(self):
+      
+      result = ""
+      for key,val in self.iteritems():
+         result += "%s:\t\t %s\n" % (key,str(val))
+
+      return result
+
+if __name__ == '__main__':
+   r1 = Run()
+   r1['attrib_1'] = 12
+   r1['attrib_2'] = 22
+
+   print "%s" % r1
+
diff --git a/qpalma/qpalma_main.py b/qpalma/qpalma_main.py
new file mode 100644 (file)
index 0000000..b3349be
--- /dev/null
@@ -0,0 +1,668 @@
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+import array
+import cPickle
+import math
+import os.path
+import pdb
+import sys
+
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
+
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+from numpy.linalg import norm
+
+#from qpalma.SIQP_CPX import SIQPSolver
+#from qpalma.SIQP_CVXOPT import SIQPSolver
+
+import QPalmaDP
+import qpalma
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.TrainingParam import Param
+from qpalma.Plif import Plf,compute_donacc
+
+from qpalma.utils import pprint_alignment, get_alignment
+
+class SpliceSiteException:
+   pass
+
+
+def getData(training_set,exampleKey,run):
+   """ This function...  """
+
+   currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
+   id,chr,strand,up_cut,down_cut = currentSeqInfo
+
+   est = original_est
+   est = "".join(est)
+   est = est.lower()
+   est = unbracket_seq(est)
+   est = est.replace('-','')
+
+   assert len(est) == run['read_size'], pdb.set_trace()
+   est_len = len(est)
+
+   #original_est = OriginalEsts[exampleIdx]
+   original_est = "".join(original_est)
+   original_est = original_est.lower()
+
+   dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
+
+   #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+
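+   # shift the genomic exon coordinates so that they are relative to the
+   # fetched DNA window starting at up_cut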
+   original_exons = currentExons
+   exons = original_exons - (up_cut-1)
+   exons[0,0] -= 1
+   exons[1,0] -= 1
+
+   if exons.shape == (2,2):
+      fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
+     
+      donor_elem = dna[exons[0,1]:exons[0,1]+2]
+      acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
+
+      if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
+         print 'invalid donor in example %d'% exampleKey
+         raise SpliceSiteException
+
+      if not ( acceptor_elem == 'ag' ):
+         print 'invalid acceptor in example %d'% exampleKey
+         raise SpliceSiteException
+
+      assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
+
+   return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
+
+
+class QPalma:
+   """
+   This class wraps the training and prediction functions for 
+   the alignment.
+   """
+   
+   def __init__(self,run,seqInfo,dmode=False):
+      self.ARGS = Param()
+      self.qpalma_debug_mode = dmode
+      self.run = run
+      self.seqInfo = seqInfo
+
+
+   def plog(self,string):
+      self.logfh.write(string)
+      self.logfh.flush()
+
+
+   def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
+      """
+      Given the needed input this method calls the QPalma C module which
+      runs a dynamic program in order to obtain an alignment.
+      """
+
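+      # convert the Python lists into C arrays via the SWIG helper functions
+      # before handing them to the QPalmaDP dynamic-programming module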
+      dna_len = len(dna)
+      est_len = len(est)
+
+      prb = QPalmaDP.createDoubleArrayFromList(quality)
+      chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
+
+      matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
+      mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
+
+      d_len = len(donor)
+      donor = QPalmaDP.createDoubleArrayFromList(donor)
+      a_len = len(acceptor)
+      acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
+
+      # Create the alignment object representing the interface to the C/C++ code.
+      currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
+      c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
+      # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
+      currentAlignment.myalign( current_num_path, dna, dna_len,\
+       est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
+       acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
+       self.run['print_matrix'] )
+
+      if prediction_mode:
+         # part that is only needed for prediction
+         result_len = currentAlignment.getResultLength()
+         dna_array,est_array = currentAlignment.getAlignmentArraysNew()
+      else:
+         dna_array = None
+         est_array = None
+
+      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
+      currentAlignment.getAlignmentResultsNew()
+
+      return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+      newQualityPlifsFeatures, dna_array, est_array
+
+
+   def init_train(self,training_set):
+      full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
+
+      #assert not os.path.exists(full_working_path)
+      if not os.path.exists(full_working_path):
+         os.mkdir(full_working_path)
+
+      assert os.path.exists(full_working_path)
+
+      # ATTENTION: Changing working directory
+      os.chdir(full_working_path)
+
+      self.logfh = open('_qpalma_train.log','w+')
+      cPickle.dump(self.run,open('run_obj.pickle','w+'))
+
+      self.plog("Settings are:\n")
+      self.plog("%s\n"%str(self.run))
+
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
+
+      elif self.run['mode'] == 'using_quality_scores':
+         self.use_quality_scores = True
+      else:
+         assert(False)
+
+
+   def setUpSolver(self,numExamples,lengthSP,donSP,accSP):
+      # Initialize solver 
+      self.plog('Initializing problem...\n')
+      
+      try:
+         solver = SIQPSolver(self.run['numFeatures'],numExamples,self.run['C'],self.logfh,self.run)
+      except:
+         self.plog('Got no license. Telling queue to reschedule job...\n')
+         sys.exit(99)
+
+      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
+
+      return solver
+
+
+   def train(self,training_set):
+      """
+      The mainloop for training.
+      """
+
+      run = self.run
+
+      numExamples = len(training_set)
+      self.plog('Number of training examples: %d\n'% numExamples)
+
+      self.noImprovementCtr = 0
+      self.oldObjValue = 1e8
+
+      iteration_steps         = self.run['iter_steps']
+      remove_duplicate_scores = self.run['remove_duplicate_scores']
+      print_matrix            = self.run['print_matrix']
+      anzpath                 = self.run['anzpath']
+
+      # Initialize parameter vector
+      param = numpy.matlib.rand(run['numFeatures'],1)
+   
+      lengthSP    = self.run['numLengthSuppPoints']
+      donSP       = self.run['numDonSuppPoints']
+      accSP       = self.run['numAccSuppPoints']
+      mmatrixSP   = self.run['matchmatrixRows']*run['matchmatrixCols']
+      numq        = self.run['numQualSuppPoints']
+      totalQualSP = self.run['totalQualSuppPoints']
+
+      # no intron length model
+      if not self.run['enable_intron_length']:
+         param[:lengthSP] *= 0.0
+
+      # Set the parameters such as limits penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+
+      solver = self.setUpSolver(numExamples,lengthSP,donSP,accSP)
+
+      # stores the number of alignments done for each example (best path, second-best path etc.)
+      num_path = [anzpath]*numExamples
+
+      # stores the gap for each example
+      gap      = [0.0]*numExamples
+
+      currentPhi = zeros((run['numFeatures'],1))
+      totalQualityPenalties = zeros((totalQualSP,1))
+
+      numConstPerRound = run['numConstraintsPerRound']
+      solver_call_ctr = 0
+
+      suboptimal_example = 0
+      iteration_nr = 0
+      param_idx = 0
+      const_added_ctr = 0
+
+      featureVectors = zeros((run['numFeatures'],numExamples))
+
+      self.plog('Starting training...\n')
+      # the main training loop
+      while True:
+         if iteration_nr == iteration_steps:
+            break
+
+         for exampleIdx,example_key in enumerate(training_set.keys()):
+            print 'Current example %d' % example_key
+            try:
+               dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
+               getData(training_set,example_key,run)
+            except SpliceSiteException:
+               continue
+
+            dna_len = len(dna)
+
+            # we take the first quality vector of the tuple of quality vectors
+            if run['enable_quality_scores']:
+               quality = currentQualities[0]
+            else:
+               quality = [40]*len(est)
+
+            if not run['enable_splice_signals']:
+               for idx,elem in enumerate(don_supp):
+                  if elem != -inf:
+                     don_supp[idx] = 0.0
+
+               for idx,elem in enumerate(acc_supp):
+                  if elem != -inf:
+                     acc_supp[idx] = 0.0
+
+            # Compute the parameters of the true alignment (but with untrained d,a,h ...)
+            if run['mode'] == 'using_quality_scores':
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
+               computeSpliceAlignWithQuality(dna, exons, est, original_est,\
+               quality, qualityPlifs,run)
+               dna_calc = dna_calc.replace('-','')
+            else:
+               trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
+
+            #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
+            # Calculate the weights
+            trueWeightDon, trueWeightAcc, trueWeightIntron =\
+            computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
+            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+
+            currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
+            currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
+            currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
+            currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
+
+            if run['mode'] == 'using_quality_scores':
+               totalQualityPenalties = param[-totalQualSP:]
+               currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
+
+            # Calculate w'phi(x,y) the total score of the alignment
+            trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
+
+            # The allWeights vector is supposed to store the weight parameter
+            # of the true alignment as well as the weight parameters of the
+            # num_path[exampleIdx] other alignments
+            allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
+            allWeights[:,0] = trueWeight[:,0]
+
+            AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
+            AlignmentScores[0] = trueAlignmentScore
+
+            ################## Calculate wrong alignment(s) ######################
+            # Compute donor, acceptor with penalty_lookup_new
+            # returns two double lists
+            donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+            ps = h.convert2SWIG()
+
+            newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+            newQualityPlifsFeatures, unneeded1, unneeded2 =\
+            self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
+            mm_len = run['matchmatrixRows']*run['matchmatrixCols']
+
+            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
+
+            newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
+            # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without 
+            # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order 
+            # not to incorporate a true alignment in the
+            # constraints. To keep track of the true and false alignments we
+            # define an array true_map with a boolean indicating the
+            # equivalence to the true alignment for each decoded alignment.
+            true_map = [0]*(num_path[exampleIdx]+1)
+            true_map[0] = 1
+
+            for pathNr in range(num_path[exampleIdx]):
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
+               h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
+               acc_supp)
+              
+               decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
+               decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
+               # store the weights in the remaining columns of the matrix
+               allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
+
+               hpen = mat(h.penalties).reshape(len(h.penalties),1)
+               dpen = mat(d.penalties).reshape(len(d.penalties),1)
+               apen = mat(a.penalties).reshape(len(a.penalties),1)
+               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
+
+               featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
+
+               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
+
+               distinct_scores = False
+               if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
+                  distinct_scores = True
+
+               # Check whether scalar product + loss equals the Viterbi score
+               if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
+                  self.plog("Scalar prod. + loss does not equal the Viterbi output!\n")
+                  pdb.set_trace()
+
+               self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
+               self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
+
+               # if the pathNr-best alignment is very close to the true alignment consider it as true
+               if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
+                  true_map[pathNr+1] = 1
+
+               if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
+                  print "suboptimal_example %d\n" %exampleIdx
+                  #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
+                  #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
+
+                  #pdb.set_trace()
+                  suboptimal_example += 1
+                  self.plog("suboptimal_example %d\n" %exampleIdx)
+
+               # if all decoded alignments are equivalent to the true one, the
+               # n-best paths are too close to each other and we have to
+               # extend the n-best search to an (n+1)-best
+               if len([elem for elem in true_map if elem == 1]) == len(true_map):
+                  num_path[exampleIdx] = num_path[exampleIdx]+1
+
+               # Choose true and first false alignment for extending
+               firstFalseIdx = -1
+               for map_idx,elem in enumerate(true_map):
+                  if elem == 0:
+                     firstFalseIdx = map_idx
+                     break
+
+               if False:
+                  self.plog("Is considered as: %d\n" % true_map[1])
+
+                  #result_len = currentAlignment.getResultLength()
+
+                  dna_array,est_array = currentAlignment.getAlignmentArraysNew()
+
+                  _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
+                  _newEstAlign = newEstAlign[0].flatten().tolist()[0]
+
+               # if there is at least one useful false alignment add the
+               # corresponding constraints to the optimization problem
+               if firstFalseIdx != -1:
+                  firstFalseWeights = allWeights[:,firstFalseIdx]
+                  differenceVector  = trueWeight - firstFalseWeights
+                  #pdb.set_trace()
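+                  # the difference vector presumably enters the solver as the
+                  # margin constraint w^T (phi_true - phi_false) >= 1 - xi,
+                  # cf. the structured-SVM formulation in [Tsochantaridis04]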
+
+                  const_added = solver.addConstraint(differenceVector, exampleIdx)
+                  const_added_ctr += 1
+
+               # end of one example processing 
+
+            # call the solver after every numConstPerRound examples
+            if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
+               objValue,w,self.slacks = solver.solve()
+               solver_call_ctr += 1
+
+               if solver_call_ctr == 5:
+                  numConstPerRound = 200
+                  self.plog('numConstPerRound is now %d\n'% numConstPerRound)
+
+               if math.fabs(objValue - self.oldObjValue) <= 1e-6:
+                  self.noImprovementCtr += 1
+
+               if self.noImprovementCtr == numExamples+1:
+                  break
+
+               self.oldObjValue = objValue
+               print "objValue is %f" % objValue
+
+               sum_xis = 0
+               for elem in self.slacks:
+                  sum_xis +=  elem
+
+               print 'sum of slacks is %f'% sum_xis 
+               self.plog('sum of slacks is %f\n'% sum_xis)
+   
+               for i in range(len(param)):
+                  param[i] = w[i]
+
+               cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
+               param_idx += 1
+               [h,d,a,mmatrix,qualityPlifs] =\
+               set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+
+         ##############################################
+         # end of one iteration through all examples  #
+         ##############################################
+
+         self.plog("suboptimal rounds %d\n" %suboptimal_example)
+
+         if self.noImprovementCtr == numExamples*2:
+            self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
+            return
+
+         iteration_nr += 1
+
+      #
+      # end of optimization 
+      #  
+      self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
+
+
+   def FinalizeTraining(self,vector,name):
+      self.plog("Training completed")
+      cPickle.dump(param,open(name,'w+'))
+      self.logfh.close()
+   
+
+###############################################################################
+#
+# End of the code needed for training 
+#
+# Begin of code for prediction
+#
+###############################################################################
+
+
+   def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
+      """
+      Load the dataset and the parameter vector, select the instances to be
+      predicted and start the prediction.
+      """
+      self.set_name = set_name
+
+      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
+      full_working_path = self.run['result_dir']
+
+      print 'full_working_path is %s' % full_working_path 
+
+      #assert not os.path.exists(full_working_path)
+      if not os.path.exists(full_working_path):
+         os.mkdir(full_working_path)
+
+      assert os.path.exists(full_working_path)
+
+      # ATTENTION: Changing working directory
+      os.chdir(full_working_path)
+
+      self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
+
+      # number of prediction instances
+      self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
+
+      # load dataset and fetch instances that shall be predicted
+      dataset = cPickle.load(open(dataset_fn))
+
+      prediction_set = {}
+      for key in prediction_keys:
+         prediction_set[key] = dataset[key]
+
+      # we do not need the full dataset anymore
+      del dataset
+   
+      # Load parameter vector to predict with
+      param = cPickle.load(open(param_fn))
+
+      self.predict(prediction_set,param)
+
+
+   def predict(self,prediction_set,param):
+      """
+      Predict alignments for all instances in prediction_set using the
+      parameter vector param.
+      """
+
+      if self.run['mode'] == 'normal':
+         self.use_quality_scores = False
+
+      elif self.run['mode'] == 'using_quality_scores':
+         self.use_quality_scores = True
+      else:
+         assert(False)
+
+      # Set the parameters such as limits/penalties for the Plifs
+      [h,d,a,mmatrix,qualityPlifs] =\
+      set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+
+      if not self.qpalma_debug_mode:
+         self.plog('Starting prediction...\n')
+
+      self.problem_ctr = 0
+
+      # where we store the predictions
+      allPredictions = []
+
+      # we take the first quality vector of the tuple of quality vectors
+      quality_index = 0
+
+      # beginning of the prediction loop
+      for example_key in prediction_set.keys():
+         print 'Current example %d' % example_key
+         for example in prediction_set[example_key]:
+
+            currentSeqInfo,read,currentQualities = example
+
+            id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
+            currentSeqInfo 
+
+            if not self.qpalma_debug_mode:
+               self.plog('Loading example id: %d...\n'% int(id))
+
+            if self.run['enable_quality_scores']:
+               quality = currentQualities[quality_index]
+            else:
+               quality = [40]*len(read)
+
+            try:
+               currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+            except:
+               self.problem_ctr += 1
+               continue
+
+            if not self.run['enable_splice_signals']:
+               for idx,elem in enumerate(currentDon):
+                  if elem != -inf:
+                     currentDon[idx] = 0.0
+
+               for idx,elem in enumerate(currentAcc):
+                  if elem != -inf:
+                     currentAcc[idx] = 0.0
+
+            current_prediction = self.calc_alignment(currentDNASeq, read,\
+            quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
+
+            current_prediction['id']         = id
+            current_prediction['chr']        = chromo
+            current_prediction['strand']     = strand
+            current_prediction['start_pos']  = genomicSeq_start
+
+            allPredictions.append(current_prediction)
+
+      if not self.qpalma_debug_mode:
+         self.FinalizePrediction(allPredictions)
+      else:
+         return allPredictions
+
+
+   def FinalizePrediction(self,allPredictions):
+      """ End of the prediction loop we save all predictions in a pickle file and exit """
+
+      cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
+      self.plog('Prediction completed\n')
+      mes =  'Problem ctr %d' % self.problem_ctr
+      self.plog(mes+'\n')
+      self.logfh.close()
+
+
+   def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+      """
+      Given two sequences and the parameters we calculate an alignment.
+      """
+
+      run = self.run
+      donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
+
+      if '-' in read:
+         self.plog('found gap\n')
+         read = read.replace('-','')
+         assert len(read) == run['read_size']
+
+      ps = h.convert2SWIG()
+
+      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
+      newQualityPlifsFeatures, dna_array, read_array =\
+      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
+
+      mm_len = run['matchmatrixRows']*run['matchmatrixCols']
+
+      true_map    = [0]*2
+      true_map[0] = 1
+      pathNr      = 0
+
+      _newSpliceAlign   = array.array('B',newSpliceAlign)
+      _newEstAlign      = array.array('B',newEstAlign)
+       
+      #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
+      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) 
+
+      dna_array   = array.array('B',dna_array)
+      read_array  = array.array('B',read_array)
+
+      newExons = self.calculatePredictedExons(newSpliceAlign)
+
+      current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
+      'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
+      'dna_array':dna_array, 'read_array':read_array }
+
+      return current_prediction
+
+
+   def calculatePredictedExons(self,SpliceAlign):
+      newExons = []
+      oldElem = -1
+      SpliceAlign.append(-1)
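+      # SpliceAlign marks exonic positions with 0; the trailing -1 sentinel
+      # closes an exon ending at the last position. E.g. [1,0,0,0,2,2,0,0]
+      # yields the boundary list [1,4,6,8].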
+      for pos,elem in enumerate(SpliceAlign):
+         if pos == 0:
+            oldElem = -1
+         else:
+            oldElem = SpliceAlign[pos-1]
+
+         if oldElem != 0 and elem == 0: # start of exon
+            newExons.append(pos)
+
+         if oldElem == 0 and elem != 0: # end of exon
+            newExons.append(pos)
+
+      return newExons
diff --git a/scripts/Evaluation.py b/scripts/Evaluation.py
deleted file mode 100644 (file)
index 952eb4c..0000000
+++ /dev/null
@@ -1,929 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import cPickle
-import sys
-import pdb
-import os
-import os.path
-import math
-
-from qpalma.parsers import *
-
-
-data = None
-
-
-def result_statistic():
-   """
-
-   """   
-
-   num_unaligned_reads = 0
-   num_incorrectly_aligned_reads = 0
-   pass
-
-   
-def createErrorVSCutPlot(results):
-   """
-   This function takes the results of the evaluation and creates a tex table.
-   """
-
-   fh = open('error_rates_table.tex','w+')
-   lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',\
-   'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',\
-   'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']
-
-   #for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
-   for pos,key in enumerate(['+++']):
-      res = results[key]
-      for i in range(37):
-         ctr = 0
-         try:
-            ctr = res[1][i]
-         except:
-            ctr = 0
-
-         lines.append( '%d\n' % ctr)
-
-      if pos % 2 == 1:
-         lines.append('\hline')
-
-   lines.append('\end{tabular}')
-
-   lines = [l+'\n' for l in lines]
-   for l in lines:
-      fh.write(l)
-   fh.close()
-
-
-def createTable(results):
-   """
-   This function takes the results of the evaluation and creates a tex table.
-   """
-
-   fh = open('result_table.tex','w+')
-   lines = ['\\begin{tabular}{|c|c|c|r|}', '\hline',\
-   'Quality & Splice & Intron & \multicolumn{1}{c|}{Error on Positions} & \multicolumn{1}{c|}{Error on Scores} & \\',\
-   'information & site pred. & length & \multicolumn{1}{c|}{rate}\\', '\hline']
-
-   for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
-      res = [e*100 for e in results[key]]
-   
-      lines.append( '%s & %s & %s & %2.2f & %2.2f \\%%\\\\' % ( key[0], key[1], key[2], res[0], res[1] ) )
-      if pos % 2 == 1:
-         lines.append('\hline')
-
-   for pos,key in enumerate(['---','+--','-+-','++-','--+','+-+','-++','+++']):
-      res = [e*100 for e in results[key]]
-   
-      lines.append( '%s & %s & %s & %2.2f & x \\%%\\\\' % ( key[0], key[1], key[2], res[2] ) )
-      if pos % 2 == 1:
-         lines.append('\hline')
-
-   lines.append('\end{tabular}')
-
-   lines = [l+'\n' for l in lines]
-   for l in lines:
-      fh.write(l)
-   fh.close()
-
-
-def compare_scores_and_labels(scores,labels):
-   """
-   Iterate through all predictions. If we find a correct prediction check
-   whether this correct prediction scores higher than the incorrect
-   predictions for this example.
-   """
-
-   for currentPos,currentElem in enumerate(scores):
-      if labels[currentPos] == True:
-         for otherPos,otherElem in enumerate(scores):
-            if otherPos == currentPos:
-               continue
-
-            if labels[otherPos] == False and otherElem >= currentElem:
-               return False
-
-   return True
-
-
-def compare_exons(predExons,trueExons):
-   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
-
-   if len(predExons) == 4:
-      e1_begin,e1_end = predExons[0],predExons[1]
-      e2_begin,e2_end = predExons[2],predExons[3]
-   else:
-      return False
-
-   e1_b_off = int(math.fabs(e1_begin - trueExons[0,0]))
-   e1_e_off = int(math.fabs(e1_end - trueExons[0,1]))
-
-   e2_b_off = int(math.fabs(e2_begin - trueExons[1,0]))
-   e2_e_off = int(math.fabs(e2_end - trueExons[1,1]))
-
-   if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0\
-   and e2_e_off == 0:
-      return True
-
-   return False
-
-
-def evaluate_unmapped_example(current_prediction):
-   predExons = current_prediction['predExons']
-   trueExons = current_prediction['trueExons']
-
-   result = compare_exons(predExons,trueExons)
-   return result
-
-
-def evaluate_example(current_prediction):
-   label = False
-   label = current_prediction['label'] 
-
-   pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]
-
-   # if the read was mapped by vmatch at an incorrect position we only have to
-   # compare the score
-   if label == False:
-      return label,False,pred_score
-
-   predExons = current_prediction['predExons']
-   trueExons = current_prediction['trueExons']
-
-   predPositions = [elem + current_prediction['alternative_start_pos'] for elem in predExons]
-   truePositions = [elem + current_prediction['start_pos'] for elem in trueExons.flatten().tolist()[0]]
-
-   pos_comparison = (predPositions == truePositions)
-
-   return label,pos_comparison,pred_score
-
-
-def prediction_on(filename):
-
-   allPredictions = cPickle.load(open(filename))
-
-   gt_correct_ctr = 0
-   gt_incorrect_ctr = 0
-   incorrect_gt_cuts       = {}
-
-   pos_correct_ctr   = 0
-   pos_incorrect_ctr = 0
-   incorrect_vmatch_cuts   = {}
-
-   score_correct_ctr = 0
-   score_incorrect_ctr = 0
-
-   total_gt_examples = 0
-   total_vmatch_instances_ctr = 0
-
-   true_vmatch_instances_ctr = 0
-
-   allUniquePredictions = [False]*len(allPredictions)
-
-   for pos,current_example_pred in enumerate(allPredictions):
-      for elem_nr,new_prediction in enumerate(current_example_pred[1:]):
-
-         if allUniquePredictions[pos] != False:
-            current_prediction = allUniquePredictions[pos]
-
-            current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
-            new_score =  new_prediction['DPScores'].flatten().tolist()[0][0]
-
-            if current_a_score < new_score :
-               allUniquePredictions[id] = new_prediction
-
-         else:
-            allUniquePredictions[pos] = new_prediction
-
-   for current_pred in allUniquePredictions:
-      if current_pred == False:
-         continue
-
-   #for current_example_pred in allPredictions:
-      #gt_example = current_example_pred[0]
-      #gt_score = gt_example['DPScores'].flatten().tolist()[0][0]
-      #gt_correct = evaluate_unmapped_example(gt_example)
-
-      #exampleIdx = gt_example['exampleIdx']
-
-      #cut_pos = gt_example['true_cut']
-
-      #if gt_correct:
-      #   gt_correct_ctr += 1
-      #else:
-      #   gt_incorrect_ctr += 1
-
-      #   try:
-      #      incorrect_gt_cuts[cut_pos] += 1
-      #   except:
-      #      incorrect_gt_cuts[cut_pos] = 1
-
-      #total_gt_examples += 1
-     
-      #current_scores = []
-      #current_labels = []
-      #for elem_nr,current_pred in enumerate(current_example_pred[1:]):
-
-      current_label,comparison_result,current_score = evaluate_example(current_pred)
-
-      # if vmatch found the right read pos we check for right exons
-      # boundaries
-      #if current_label:
-      if comparison_result:
-         pos_correct_ctr += 1
-      else:
-         pos_incorrect_ctr += 1
-
-            #try:
-            #   incorrect_vmatch_cuts[cut_pos] += 1
-            #except:
-            #   incorrect_vmatch_cuts[cut_pos] = 1
-
-         true_vmatch_instances_ctr += 1
-         
-      #current_scores.append(current_score)
-      #current_labels.append(current_label)
-
-      total_vmatch_instances_ctr += 1
-
-      # check whether the correct predictions score higher than the incorrect
-      # ones
-      #cmp_res = compare_scores_and_labels(current_scores,current_labels)
-      #if cmp_res:
-      #   score_correct_ctr += 1
-      #else:
-      #   score_incorrect_ctr += 1
-
-   # now that we have evaluated all instances put out all counters and sizes
-   print 'Total num. of examples: %d' % len(allPredictions)
-   print 'Number of correct ground truth examples: %d' % gt_correct_ctr
-   print 'Total num. of true vmatch instances %d' % true_vmatch_instances_ctr
-   print 'Correct pos: %d, incorrect pos: %d' % (pos_correct_ctr,pos_incorrect_ctr)
-   print 'Total num. of vmatch instances %d' % total_vmatch_instances_ctr
-   print 'Correct scores: %d, incorrect scores: %d' %\
-   (score_correct_ctr,score_incorrect_ctr)
-
-   pos_error   = 1.0 * pos_incorrect_ctr / total_vmatch_instances_ctr
-   score_error = 1.0 * score_incorrect_ctr / total_vmatch_instances_ctr
-   # guard against division by zero: the ground-truth evaluation above is
-   # currently commented out, so total_gt_examples may remain 0
-   gt_error    = 1.0 * gt_incorrect_ctr / max(1,total_gt_examples)
-
-   return (pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts)
-
-
-def collect_prediction(current_dir,run_name):
-   """
-   Given the toplevel directory this function takes care that for each distinct
-   experiment the training and test predictions are evaluated.
-
-   """
-   idx = 5
-
-   train_suffix   = '_%d_allPredictions_TRAIN' % (idx)
-   test_suffix    = '_%d_allPredictions_TEST' % (idx)
-
-   jp = os.path.join
-   b2s = ['-','+']
-
-   currentRun = cPickle.load(open(jp(current_dir,'run_object_%d.pickle'%(idx))))
-   QFlag    = currentRun['enable_quality_scores']
-   SSFlag   = currentRun['enable_splice_signals']
-   ILFlag   = currentRun['enable_intron_length']
-   currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
-   
-   #filename =  jp(current_dir,run_name)+train_suffix
-   #print 'Prediction on: %s' % filename
-   #train_result = prediction_on(filename)
-   train_result = []
-
-   filename =  jp(current_dir,run_name)+test_suffix
-   print 'Prediction on: %s' % filename
-   test_result = prediction_on(filename)
-
-   return train_result,test_result,currentRunId
-
-
-def perform_prediction(current_dir,run_name):
-   """
-   This function takes care of starting the jobs needed for the prediction phase
-   of qpalma
-   """
-
-   #for i in range(1,6):
-   for i in range(1,2):
-      cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s %d |\
-      qsub -l h_vmem=12.0G -cwd -j y -N \"%s_%d.log\"'%(current_dir,i,run_name,i)
-
-      #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
-      #print cmd
-      os.system(cmd)
-
-
-
-def forall_experiments(current_func,tl_dir):
-   """
-   Given the toplevel directoy this function calls for each subdir the
-   function given as first argument. Which are at the moment:
-
-   - perform_prediction, and
-   - collect_prediction.
-   
-   """
-
-   dir_entries = os.listdir(tl_dir)
-   dir_entries = [os.path.join(tl_dir,de) for de in dir_entries]
-   run_dirs = [de for de in dir_entries if os.path.isdir(de)]
-
-   all_results = {}
-   all_error_rates = {}
-
-   for current_dir in run_dirs:
-      run_name = current_dir.split('/')[-1]
-
-      #pdb.set_trace()
-
-      if current_func.__name__ == 'perform_prediction':
-         current_func(current_dir,run_name)
-
-      if current_func.__name__ == 'collect_prediction':
-         train_result,test_result,currentRunId = current_func(current_dir,run_name)
-         all_results[currentRunId] = test_result
-         pos_error,score_error,gt_error,incorrect_gt_cuts,incorrect_vmatch_cuts = test_result
-         all_error_rates[currentRunId] = (incorrect_gt_cuts,incorrect_vmatch_cuts)
-
-   if current_func.__name__ == 'collect_prediction':
-      #createErrorVSCutPlot(all_error_rates)
-      createTable(all_results)
-
-def _predict_on(filename,filtered_reads,with_coverage):
-   """
-   This function evaluates the predictions  made by QPalma.
-   It needs a pickled file containing the predictions themselves and the
-   ascii file with original reads.
-
-   Optionally one can specifiy a coverage file containing for each read the
-   coverage number estimated by a remapping step.
-
-   """
-
-   coverage_map = {}
-
-   if with_coverage:
-      for line in open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/coverage_results/ALL_COVERAGES'):
-         id,coverage_nr = line.strip().split()
-         coverage_map[int(id)] = int(coverage_nr)
-
-   print 'parsing filtered reads..'
-   all_filtered_reads = parse_filtered_reads(filtered_reads)
-   print 'found %d filtered reads' % len(all_filtered_reads)
-
-   out_fh = open('predicted_positions.txt','w+')
-
-   allPredictions = cPickle.load(open(filename))
-
-   spliced_ctr = 0
-   unspliced_ctr = 0
-
-   pos_correct_ctr   = 0
-   pos_incorrect_ctr = 0
-
-   correct_spliced_ctr     = 0
-   correct_unspliced_ctr   = 0
-
-   incorrect_spliced_ctr     = 0
-   incorrect_unspliced_ctr   = 0
-
-   correct_covered_splice_ctr = 0
-   incorrect_covered_splice_ctr = 0
-
-   total_vmatch_instances_ctr = 0
-
-   unspliced_spliced_reads_ctr = 0
-   wrong_spliced_reads_ctr = 0
-
-   wrong_aligned_unspliced_reads_ctr = 0
-   wrong_unspliced_reads_ctr = 0
-
-   cut_pos_ctr = {}
-
-   total_ctr = 0
-   skipped_ctr = 0
-
-   is_spliced = False
-   min_coverage = 3
-
-   allUniqPredictions = {}
-
-   print 'Got %d predictions' % len(allPredictions)
-
-   for new_prediction in allPredictions:
-      id = new_prediction['id']
-      id = int(id)
-
-      if allUniqPredictions.has_key(id):
-         current_prediction = allUniqPredictions[id]
-
-         current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
-         new_score =  new_prediction['DPScores'].flatten().tolist()[0][0]
-
-         if current_a_score < new_score :
-            allUniqPredictions[id] = new_prediction
-
-      else:
-         allUniqPredictions[id] = new_prediction
-
-   print 'Got %d uniq predictions' % len(allUniqPredictions)
-
-   #for current_prediction in allPredictions:
-   for _id,current_prediction in allUniqPredictions.items():
-      id = current_prediction['id']
-      id = int(id)
-
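-      # read ids below 1000000300000 appear to mark spliced reads; the same
-      # constant is used in PipelineHeuristic.filter further below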
-      is_spliced = id < 1000000300000
-
-      is_covered = False
-
-      if is_spliced and with_coverage:
-         try:
-            current_coverage_nr = coverage_map[id]
-            is_covered = True
-         except:
-            is_covered = False
-
-
-      if is_spliced:
-         spliced_ctr += 1
-      else: 
-         unspliced_ctr += 1
-   
-      try:
-         current_ground_truth = all_filtered_reads[id]
-      except:
-         skipped_ctr += 1
-         continue
-
-      start_pos = current_prediction['start_pos']
-      chr = current_prediction['chr']
-      strand = current_prediction['strand']
-
-      #score = current_prediction['DPScores'].flatten().tolist()[0][0]
-      #pdb.set_trace()
-
-      predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
-      predExons = [e+start_pos for e in predExons]
-
-      spliced_flag = False
-
-      if len(predExons) == 4:
-         spliced_flag = True
-         predExons[1] -= 1
-         predExons[3] -= 1
-
-         #if predExons[0] == 19504568:
-         #   pdb.set_trace()
-
-         cut_pos = current_ground_truth['true_cut']
-         p_start = current_ground_truth['p_start']
-         e_stop = current_ground_truth['exon_stop']
-         e_start = current_ground_truth['exon_start']
-         p_stop = current_ground_truth['p_stop']
-
-         true_cut = current_ground_truth['true_cut']
-
-         if p_start == predExons[0] and e_stop == predExons[1] and\
-         e_start == predExons[2] and p_stop == predExons[3]:
-            pos_correct = True
-         else:
-            pos_correct = False
-
-      elif len(predExons) == 2:
-         spliced_flag = False
-         predExons[1] -= 1
-
-         cut_pos = current_ground_truth['true_cut']
-         p_start = current_ground_truth['p_start']
-         p_stop = current_ground_truth['p_stop']
-
-         true_cut = current_ground_truth['true_cut']
-
-         if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
-            pos_correct = True
-         else:
-            pos_correct = False
-
-      else:
-         pos_correct = False
-
-      if is_spliced and not spliced_flag:
-         unspliced_spliced_reads_ctr += 1
-
-      if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
-          wrong_spliced_reads_ctr += 1
-
-      if not is_spliced and spliced_flag:
-         wrong_unspliced_reads_ctr += 1
-
-      if not is_spliced and not pos_correct:
-         wrong_aligned_unspliced_reads_ctr += 1
-
-      if pos_correct:
-         pos_correct_ctr += 1
-
-         if is_spliced:
-               correct_spliced_ctr += 1
-               if with_coverage and is_covered and current_coverage_nr >= min_coverage:
-                  correct_covered_splice_ctr += 1 
-
-         if not is_spliced:
-               correct_unspliced_ctr += 1
-
-      else:
-         pos_incorrect_ctr += 1
-
-         if is_spliced:
-               incorrect_spliced_ctr += 1
-               if with_coverage and is_covered and current_coverage_nr >= min_coverage:
-                  incorrect_covered_splice_ctr += 1 
-
-         if not is_spliced:
-               incorrect_unspliced_ctr += 1
-
-      if with_coverage and spliced_flag:
-          if not is_covered:
-              current_coverage_nr=0 
-          if pos_correct:
-              print "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
-          else:
-              print "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
-
-      total_ctr += 1
-
-
-   numPredictions = len(allUniqPredictions)
-
-   # now that we have evaluated all instances put out all counters and sizes
-   print 'Total num. of examples: %d' % numPredictions
-
-   print "spliced/unspliced:  %d,%d " % (spliced_ctr, unspliced_ctr )
-   print "Correct/incorrect spliced: %d,%d " % (correct_spliced_ctr, incorrect_spliced_ctr )
-   print "Correct/incorrect unspliced: %d,%d " % (correct_unspliced_ctr , incorrect_unspliced_ctr )
-   print "Correct/incorrect covered spliced read: %d,%d " %\
-   (correct_covered_splice_ctr,incorrect_covered_splice_ctr)
-
-   print "pos_correct: %d,%d" % (pos_correct_ctr ,  pos_incorrect_ctr )
-
-   print 'unspliced_spliced reads: %d' % unspliced_spliced_reads_ctr 
-   print 'spliced reads at wrong_place: %d' % wrong_spliced_reads_ctr
-
-   print 'spliced_unspliced reads:  %d' % wrong_unspliced_reads_ctr
-   print 'wrong aligned at wrong_pos: %d' % wrong_aligned_unspliced_reads_ctr
-
-   print 'total_ctr: %d' % total_ctr
-
-   print "skipped: %d "  % skipped_ctr
-   print 'min. coverage: %d' % min_coverage
-
-   result_dict = {}
-   result_dict['skipped_ctr'] = skipped_ctr
-   result_dict['min_coverage'] = min_coverage
-
-   return result_dict
-
-
-
-
-
-def predict_on(allPredictions,all_filtered_reads,all_labels_fn,with_coverage,coverage_fn,coverage_labels_fn):
-   """
-   This function evaluates the predictions  made by QPalma.
-   It needs a pickled file containing the predictions themselves and the
-   ascii file with original reads.
-
-   Optionally one can specifiy a coverage file containing for each read the
-   coverage number estimated by a remapping step.
-
-
-   """
-
-   coverage_labels_fh = open(coverage_labels_fn,'w+')
-
-   all_labels_fh = open(all_labels_fn,'w+')
-
-   import qparser
-   qparser.parse_reads(all_filtered_reads)
-
-   coverage_map = {}
-
-
-   if with_coverage:
-      for line in open(coverage_fn):
-         id,coverage_nr = line.strip().split()
-         coverage_map[int(id)] = int(coverage_nr)
-
-   #out_fh = open('predicted_positions.txt','w+')
-
-   spliced_ctr = 0
-   unspliced_ctr = 0
-
-   pos_correct_ctr   = 0
-   pos_incorrect_ctr = 0
-
-   correct_spliced_ctr     = 0
-   correct_unspliced_ctr   = 0
-
-   incorrect_spliced_ctr     = 0
-   incorrect_unspliced_ctr   = 0
-
-   correct_covered_splice_ctr = 0
-   incorrect_covered_splice_ctr = 0
-
-   total_vmatch_instances_ctr = 0
-
-   unspliced_spliced_reads_ctr = 0
-   wrong_spliced_reads_ctr = 0
-
-   wrong_aligned_unspliced_reads_ctr = 0
-   wrong_unspliced_reads_ctr = 0
-
-   cut_pos_ctr = {}
-
-   total_ctr = 0
-   skipped_ctr = 0
-
-   is_spliced = False
-   min_coverage = 3
-
-   allUniqPredictions = {}
-
-   print 'Got %d predictions' % len(allPredictions)
-
-   for k,predictions in allPredictions.items():
-      for new_prediction in predictions:
-         id = new_prediction['id']
-         id = int(id)
-
-         if allUniqPredictions.has_key(id):
-            current_prediction = allUniqPredictions[id]
-
-            current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
-            new_score =  new_prediction['DPScores'].flatten().tolist()[0][0]
-
-            if current_a_score < new_score :
-               allUniqPredictions[id] = new_prediction
-
-         else:
-            allUniqPredictions[id] = new_prediction
-
-   print 'Got %d uniq predictions' % len(allUniqPredictions)
-
-   for _id,current_prediction in allUniqPredictions.items():
-      id = current_prediction['id']
-      id = int(id)
-
-      is_spliced = id < 1000000300000
-
-      is_covered = False
-
-      if is_spliced and with_coverage:
-         try:
-            current_coverage_nr = coverage_map[id]
-            is_covered = True
-         except:
-            is_covered = False
-
-
-      if is_spliced:
-         spliced_ctr += 1
-      else: 
-         unspliced_ctr += 1
-   
-      try:
-         #current_ground_truth = all_filtered_reads[id]
-         current_ground_truth = qparser.fetch_read(id)
-      except:
-         skipped_ctr += 1
-         continue
-
-      start_pos = current_prediction['start_pos']
-      chr = current_prediction['chr']
-      strand = current_prediction['strand']
-
-      #score = current_prediction['DPScores'].flatten().tolist()[0][0]
-      #pdb.set_trace()
-
-      predExons = current_prediction['predExons'] #:newExons, 'dna':dna, 'est':est
-      predExons = [e+start_pos for e in predExons]
-
-      spliced_flag = False
-
-      if len(predExons) == 4:
-         spliced_flag = True
-         predExons[1] -= 1
-         predExons[3] -= 1
-
-         cut_pos = current_ground_truth['true_cut']
-         p_start = current_ground_truth['p_start']
-         e_stop = current_ground_truth['exon_stop']
-         e_start = current_ground_truth['exon_start']
-         p_stop = current_ground_truth['p_stop']
-
-         true_cut = current_ground_truth['true_cut']
-
-         if p_start == predExons[0] and e_stop == predExons[1] and\
-         e_start == predExons[2] and p_stop == predExons[3]:
-            pos_correct = True
-         else:
-            pos_correct = False
-
-      elif len(predExons) == 2:
-         spliced_flag = False
-         predExons[1] -= 1
-
-         cut_pos = current_ground_truth['true_cut']
-         p_start = current_ground_truth['p_start']
-         p_stop = current_ground_truth['p_stop']
-
-         true_cut = current_ground_truth['true_cut']
-
-         if math.fabs(p_start - predExons[0]) <= 0:# and math.fabs(p_stop - predExons[1]) <= 2:
-            pos_correct = True
-         else:
-            pos_correct = False
-
-      else:
-         pos_correct = False
-
-      if is_spliced and not spliced_flag:
-         unspliced_spliced_reads_ctr += 1
-
-      if is_spliced and not pos_correct and len(predExons) == 4 and predExons[1]!=-1:
-          wrong_spliced_reads_ctr += 1
-
-      if not is_spliced and spliced_flag:
-         wrong_unspliced_reads_ctr += 1
-
-      if not is_spliced and not pos_correct:
-         wrong_aligned_unspliced_reads_ctr += 1
-
-      if pos_correct:
-         pos_correct_ctr += 1
-
-         if is_spliced:
-               correct_spliced_ctr += 1
-               all_labels_fh.write('%d correct\n'%id)
-               if with_coverage and is_covered and current_coverage_nr >= min_coverage:
-                  correct_covered_splice_ctr += 1 
-
-         if not is_spliced:
-               correct_unspliced_ctr += 1
-
-      else:
-         pos_incorrect_ctr += 1
-
-         if is_spliced:
-               incorrect_spliced_ctr += 1
-               all_labels_fh.write('%d wrong\n'%id)
-               if with_coverage and is_covered and current_coverage_nr >= min_coverage:
-                  incorrect_covered_splice_ctr += 1 
-
-         if not is_spliced:
-               incorrect_unspliced_ctr += 1
-
-      if with_coverage:
-         if not is_covered:
-            current_coverage_nr=0 
-
-         if pos_correct:
-            new_line = "%s\tcorrect\t%i" %( current_prediction['id'], current_coverage_nr)
-         else:
-            new_line = "%s\twrong\t%i" %( current_prediction['id'], current_coverage_nr)
-
-         coverage_labels_fh.write(new_line+'\n')
-
-      total_ctr += 1
-
-   coverage_labels_fh.close()
-
-   numPredictions = len(allUniqPredictions)
-
-   result = []
-
-   # now that we have evaluated all instances put out all counters and sizes
-   result.append(('numPredictions',numPredictions))
-   result.append(('spliced_ctr',spliced_ctr))
-   result.append(('unspliced_ctr',unspliced_ctr))
-
-   result.append(('correct_spliced_ctr',correct_spliced_ctr))
-   result.append(('incorrect_spliced_ctr',incorrect_spliced_ctr))
-
-   result.append(('correct_unspliced_ctr',correct_unspliced_ctr))
-   result.append(('incorrect_unspliced_ctr',incorrect_unspliced_ctr))
-
-   result.append(('correct_covered_splice_ctr',correct_covered_splice_ctr))
-   result.append(('incorrect_covered_splice_ctr',incorrect_covered_splice_ctr))
-
-   result.append(('pos_correct_ctr',pos_correct_ctr))
-   result.append(('pos_incorrect_ctr',pos_incorrect_ctr))
-
-   result.append(('unspliced_spliced_reads_ctr',unspliced_spliced_reads_ctr))
-   result.append(('wrong_spliced_reads_ctr',wrong_spliced_reads_ctr))
-
-   result.append(('wrong_unspliced_reads_ctr',wrong_unspliced_reads_ctr))
-   result.append(('wrong_aligned_unspliced_reads_ctr',wrong_aligned_unspliced_reads_ctr))
-
-   result.append(('total_ctr',total_ctr))
-
-   result.append(('skipped_ctr',skipped_ctr))
-   result.append(('min_coverage',min_coverage))
-
-   return result
-
-
-
-def print_result(result):
-   # now that we have evaluated all instances put out all counters and sizes
-   for name,ctr in result:
-      print name,ctr
-
-
-def load_chunks(current_dir):
-   chunks_fn = []
-   for fn in os.listdir(current_dir):
-      if fn.startswith('chunk'):
-         chunks_fn.append(fn)
-
-   allPredictions = []
-
-   for c_fn in chunks_fn:
-      full_fn = os.path.join(current_dir,c_fn)
-      print full_fn
-      current_chunk = cPickle.load(open(full_fn))
-      allPredictions.extend(current_chunk)
-
-   return allPredictions
-
-
-def predict_on_all_chunks(current_dir,training_keys_fn):
-   """
-   We load all chunks from the current_dir belonging to one run.
-   Then we load the saved keys of the training set to restore the training and
-   testing sets.
-   Once we have done that we separately evaluate both sets.
-
-   """
-   
-   allPredictions = load_chunks(current_dir)
-
-   allPredictionsDict = {}
-   for elem in allPredictions:
-      id = elem['id']
-
-      if allPredictionsDict.has_key(id):
-         old_entry = allPredictionsDict[id]
-         old_entry.append(elem)
-         allPredictionsDict[id] = old_entry
-      else:
-         allPredictionsDict[id] = [elem]
-
-   training_keys = cPickle.load(open(training_keys_fn))
-
-   training_set = {}
-   for key in training_keys:
-      # we have the try construct because some of the reads used for training
-      # may not be found using vmatch at all
-      try:
-         training_set[key] = allPredictionsDict[key]
-         del allPredictionsDict[key]
-      except:
-         pass
-
-   test_set = allPredictionsDict
-
-   #test_set = {}
-   #for k in allPredictionsDict.keys()[:100]:
-   #   test_set[k] = allPredictionsDict[k]
-   #result_train = predict_on(training_set,all_filtered_reads,False,coverage_fn)
-   #pdb.set_trace()
-
-   # this is the heuristic.parsed_spliced_reads.txt file from the vmatch remapping step
-   coverage_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/all_coverages' 
-
-   all_filtered_reads = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
-
-   coverage_labels_fn = 'COVERAGE_LABELS'
-
-   result_test = predict_on(test_set,all_filtered_reads,'all_prediction_labels.txt',True,coverage_fn,coverage_labels_fn)
-   #print_result(result_train)
-
-   return result_test
-   
-
-if __name__ == '__main__':
-   pass
diff --git a/scripts/Experiment.py b/scripts/Experiment.py
deleted file mode 100644 (file)
index f26e7be..0000000
+++ /dev/null
@@ -1,213 +0,0 @@
-###############################################################################
-#
-# This file contains settings for one experiment
-#
-# The general idea is as follows:
-# 
-# Suppose you have a machine learning algorithm you want to perform model
-# selection with. Then for each value of, for example, the parameter C of a
-# C-SVM this script generates a Run object (a subclass of dict) storing the
-# respective parameters. (A minimal sketch of this idea follows the imports
-# below.)
-#
-###############################################################################
-
-import QPalmaConfiguration as Conf
-from Run import *
-import pdb
-import os
-import os.path
-import cPickle
-
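-# Illustrative sketch (added for clarity, not part of the original pipeline):
-# for a model selection over e.g. the regularization parameter C, one Run
-# object per candidate value could be generated as follows; the function name
-# and the default grid are made up for this example.
-def runs_for_C_grid(C_values=(1,10,100)):
-   all_runs = []
-   for C in C_values:
-      currentRun = Run()
-      currentRun['C']    = C
-      currentRun['name'] = 'run_C_%d' % C
-      all_runs.append(currentRun)
-   return all_runs
-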
-def get_dataset(dataset_size,num_nodes):
-   all_instances = []
-
-   params = [\
-   ('prediction_begin',400000),\
-   ('prediction_end',440000)]
-
-   all_instances.append(params)
-
-   return all_instances
-
-
-def get_dataset_splitting_instances(dataset_size,num_nodes):
-   all_instances = []
-
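-   # split [0,dataset_size) into num_nodes contiguous parts; the last part
-   # absorbs the remainder, e.g. dataset_size=10, num_nodes=3 yields the
-   # (begin,end) pairs (0,3), (3,6) and (6,10)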
-   part = dataset_size / num_nodes
-   begin = 0
-   end = 0
-   for idx in range(1,num_nodes+1):
-      
-      if idx == num_nodes:
-         begin = end
-         end   = dataset_size
-      else:
-         begin = end
-         end = begin+part
-
-      params = [\
-      ('prediction_begin',begin),\
-      ('prediction_end',end)]
-
-      all_instances.append(params)
-
-   return all_instances
-
-
-def get_training_instances():
-   all_instances = []
-
-   params = [\
-   ('enable_quality_scores',True),\
-   ('enable_splice_signals',True),\
-   ('enable_intron_length',True)]
-
-   all_instances.append(params)
-
-   return all_instances
-
-
-def get_scoring_instances():
-   all_instances = []
-
-   for QFlag in [True,False]:
-      for SSFlag in [True,False]:
-         for ILFlag in [True,False]:
-            params = [\
-            ('enable_quality_scores',QFlag),\
-            ('enable_splice_signals',SSFlag),\
-            ('enable_intron_length',ILFlag)]
-            all_instances.append(params)
-
-   return all_instances
-
-
-def createRuns():
-   # load the configuration object
-   Config = cPickle.load(open(Conf.conf_object_path))
-
-   # the main directory where all results are stored
-   alignment_dir   = Config['alignment_dir']
-   
-   # list of regularization parameters and additional flags for different runs
-   # for example:
-   #  - with quality scores
-   #  - without quality scores
-   #
-   bool2str = ['-','+']
-   ctr = 1
-
-   all_instances = get_training_instances()
-            
-   allRuns = []
-   for parameters in all_instances:
-      # create a new Run object
-      currentRun = Run()
-      currentName = 'run'
-
-      for param_name,param in parameters:
-         currentRun[param_name] = param
-         currentName += '_%s_%s' % (str(param_name),str(bool2str[param]))
-            
-      # global settings for all runs
-      currentRun['anzpath']            = Conf.anzpath
-      currentRun['iter_steps']         = Conf.iter_steps
-      currentRun['matchmatrixRows']    = Conf.sizeMatchmatrix[0]
-      currentRun['matchmatrixCols']    = Conf.sizeMatchmatrix[1]
-      currentRun['mode']               = Conf.mode
-      currentRun['numConstraintsPerRound'] = Conf.numConstraintsPerRound 
-
-      currentRun['remove_duplicate_scores']  = Conf.remove_duplicate_scores
-      currentRun['print_matrix']          = Conf.print_matrix
-      currentRun['read_size']             = Conf.read_size
-
-      currentRun['numDonSuppPoints']      = 10
-      currentRun['numAccSuppPoints']      = 10
-
-      currentRun['numQualPlifs']          = Conf.numQualPlifs
-      currentRun['numQualSuppPoints']     = 10
-      currentRun['totalQualSuppPoints']   = currentRun['numQualPlifs']*currentRun['numQualSuppPoints']
-
-      currentRun['enable_quality_scores'] = True
-      currentRun['enable_splice_signals'] = True
-      currentRun['enable_intron_length']  = True
-
-      # if we are not using an intron length model at all we do not need the support points
-      currentRun['numLengthSuppPoints']   = 10 #Conf.numLengthSuppPoints
-
-      if currentRun['enable_intron_length'] == False:
-         currentRun['numLengthSuppPoints']   = 2 #Conf.numLengthSuppPoints
-
-      currentRun['numFeatures']           = currentRun['numLengthSuppPoints']\
-      + currentRun['numDonSuppPoints'] + currentRun['numAccSuppPoints']\
-      + currentRun['matchmatrixRows'] * currentRun['matchmatrixCols']\
-      + currentRun['totalQualSuppPoints'] 
-
-      # run-specific settings
-      currentRun['C']                     = 100
-
-      currentRun['name']                  = currentName
-      currentRun['alignment_dir']         = alignment_dir
-
-      currentRun['min_intron_len']        = 20
-      currentRun['max_intron_len']        = 2000
-
-      currentRun['min_svm_score']         = 0.0 
-      currentRun['max_svm_score']         = 1.0
-
-      currentRun['min_qual']              = -5
-      currentRun['max_qual']              = 40
-
-      currentRun['dna_flat_files']        = Conf.dna_flat_fn
-
-      currentRun['id']      = ctr
-      ctr += 1
-
-      allRuns.append(currentRun)
-
-###############################################################################
-#
-# check for valid paths / options etc
-#
-###############################################################################
-
-   for currentRun in allRuns:
-
-      assert 0 < currentRun['anzpath'] < 100
-      assert currentRun['iter_steps']
-
-      #assert currentRun['matchmatrixCols']
-      #assert currentRun['matchmatrixRows']
-
-      assert currentRun['mode'] in ['normal','using_quality_scores']
-
-      #assert currentRun['numConstraintsPerRound']
-
-      assert 0 < currentRun['numFeatures'] < 10000
-
-      # assert currentRun['numLengthSuppPoints']
-      # assert currentRun['numDonSuppPoints']
-      # assert currentRun['numAccSuppPoints']
-      #assert currentRun['numQualPlifs']
-      #assert currentRun['numQualSuppPoints']
-      #assert numQualPlifs       >= 0
-      #assert numDonSuppPoints    > 1
-      #assert numAccSuppPoints    > 1
-      #assert numLengthSuppPoints > 1 
-      #assert numQualSuppPoints   > 1
-
-      assert currentRun['print_matrix'] in [True,False]
-      assert 0 < currentRun['read_size'] < 100
-      assert currentRun['remove_duplicate_scores'] in [True,False]
-
-      assert currentRun['enable_quality_scores'] in [True,False]
-      assert currentRun['enable_splice_signals'] in [True,False]
-      assert currentRun['enable_intron_length']  in [True,False]
-
-      #assert currentRun['totalQualSuppPoints']
-      assert os.path.exists(currentRun['alignment_dir'])
-
-   return allRuns
-
-if __name__ == '__main__':
-   allRuns = createRuns()
-   #pdb.set_trace()
diff --git a/scripts/ModelSelection.py b/scripts/ModelSelection.py
deleted file mode 100644 (file)
index ebc2baf..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-import cPickle
-import os
-import os.path
-import time
-import pdb
-import subprocess
-
-from qpalma_main import QPalma
-import Experiment as Exp
-
-class Model:
-
-   allInstances = []
-   
-   def __init__(self):
-      pass
-
-   def createInstances(self):
-
-      allRuns = Exp.createRuns()
-      
-      #pdb.set_trace()
-      
-      for currentRun in allRuns:
-               
-         currentInstance = QPalma(currentRun)
-
-         #print 'instance created, starting to pickle configuration...'
-         name = currentRun['name']
-         instance_fn = '%s.pickle'%name
-         fh = open(instance_fn,'w+')
-         cPickle.dump(currentInstance,fh)
-         fh.close()
-
-         self.allInstances.append(instance_fn)
-
-
-   def doSelection(self):
-      for instance in self.allInstances:
-         time.sleep(2)
-         cmd = 'echo ./resurrect %s | qsub -l h_vmem=5.0G -cwd -j y -N \"%s.log\"'%(instance,instance)
-         #print cmd
-         os.system(cmd)
-
-         #os.system('echo "./resurrect %s >out_%s.log 2>err_%s.log &"'%(instance,instance,instance))
-
-
-   def doPrediction(self):
-      current_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_single_run'
-
-      for instance in self.allInstances:
-         time.sleep(2)
-         cmd = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s 1>_%s.log 2>%s.log & '%(instance,instance,instance)
-         #cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s |\
-         #qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(instance,instance)
-         #os.system(cmd) 
-         obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-         #out,err = obj.communicate()
-         #print out,err
-
-
-
-if __name__ == '__main__':
-   m = Model()
-   m.createInstances()
-   m.doSelection()
-   #m.doPrediction()
-
diff --git a/scripts/PipelineHeuristic.py b/scripts/PipelineHeuristic.py
deleted file mode 100644 (file)
index 3fd1418..0000000
+++ /dev/null
@@ -1,618 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Gunnar Rätsch, Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import cPickle
-import sys
-import pdb
-import os
-import os.path
-import resource
-
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-
-from numpy.matlib import mat,zeros,ones,inf
-from numpy import inf,mean
-
-from ParaParser import ParaParser,IN_VECTOR
-
-from qpalma.Lookup import LookupTable
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement,unbracket_seq
-
-
-def cpu():
-   return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+\
-   resource.getrusage(resource.RUSAGE_SELF).ru_stime) 
-
-
-class BracketWrapper:
-   fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
-
-   def __init__(self,filename):
-      self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
-      self.parser.parseFile(filename)
-
-   def __len__(self):
-      return self.parser.getSize(IN_VECTOR)
-      
-   def __getitem__(self,key):
-      return self.parser.fetchEntry(key)
-
-   def __iter__(self):
-      self.counter = 0
-      self.size = self.parser.getSize(IN_VECTOR)
-      return self
-
-   def next(self):
-      if not self.counter < self.size:
-         raise StopIteration
-      return_val = self.parser.fetchEntry(self.counter)
-      self.counter += 1
-      return return_val
-
-
-class PipelineHeuristic:
-   """
-   This class wraps the filter which decides whether an alignment found by
-   vmatch is spliced and should then be re-aligned using QPalma or not.
-   """
-
-   def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
-      """
-      We need a run object holding information about the nr. of support points
-      etc.
-      """
-
-      run = cPickle.load(open(run_fname))
-      self.run = run
-
-      start = cpu()
-      self.all_remapped_reads = BracketWrapper(data_fname)
-      stop = cpu()
-
-      print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
-
-      self.chromo_range = settings['allowed_fragments']
-
-      accessWrapper = DataAccessWrapper(settings)
-      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
-      start = cpu()
-      self.lt1 = LookupTable(settings)
-      stop = cpu()
-
-      print 'prefetched sequence and splice data in %f sec' % (stop-start)
-
-      self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
-      self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
-
-      start = cpu()
-
-      self.data_fname = data_fname
-
-      self.param = cPickle.load(open(param_fname))
-      
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(self.param,True,run)
-
-      self.h = h
-      self.d = d
-      self.a = a
-      self.mmatrix = mmatrix
-      self.qualityPlifs = qualityPlifs
-
-      self.read_size    = 38
-      self.prb_offset   = 64
-      #self.prb_offset   = 50
-
-      # parameters of the heuristics to decide whether the read is spliced
-      #self.splice_thresh      = 0.005
-      self.splice_thresh      = 0.01
-      self.max_intron_size    = 2000
-      self.max_mismatch       = 2 
-      #self.splice_stop_thresh = 0.99
-      self.splice_stop_thresh = 0.8
-      self.spliced_bias       = 0.0
-
-      param_lines = [\
-      ('%f','splice_thresh',self.splice_thresh),\
-      ('%d','max_intron_size',self.max_intron_size),\
-      ('%d','max_mismatch',self.max_mismatch),\
-      ('%f','splice_stop_thresh',self.splice_stop_thresh),\
-      ('%f','spliced_bias',self.spliced_bias)]
-
-      param_lines = [('# %s: '+form+'\n')%(name,val) for form,name,val in param_lines]
-      param_lines.append('# data: %s\n'%self.data_fname)
-      param_lines.append('# param: %s\n'%param_fname)
-
-      #pdb.set_trace()
-
-      for p_line in param_lines:
-         self.result_spliced_fh.write(p_line)
-         self.result_unspliced_fh.write(p_line)
-   
-      #self.original_reads = {}
-
-      # we do not have this information
-      #for line in open(reads_pipeline_fn):
-      #   line = line.strip()
-      #   id,seq,q1,q2,q3 = line.split()
-      #   id = int(id)
-      #   self.original_reads[id] = seq
-
-      lengthSP    = run['numLengthSuppPoints']
-      donSP       = run['numDonSuppPoints']
-      accSP       = run['numAccSuppPoints']
-      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = run['numQualSuppPoints']
-      totalQualSP = run['totalQualSuppPoints']
-
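-      # currentPhi stacks all model parameters into one vector; its layout
-      # (from the slice assignments below) is:
-      #   [ intron length | donor | acceptor | match matrix | quality plifs ]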
-      currentPhi = zeros((run['numFeatures'],1))
-      currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
-      currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
-      currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
-      currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
-
-      totalQualityPenalties = self.param[-totalQualSP:]
-      currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
-      self.currentPhi = currentPhi
-
-      # we want to identify spliced reads 
-      # so true pos are spliced reads that are predicted "spliced"
-      self.true_pos  = 0
-      
-      # as false positives we count all reads that are not spliced but predicted
-      # as "spliced"
-      self.false_pos = 0
-
-      self.true_neg  = 0
-      self.false_neg = 0
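-
-      # viewed as a confusion matrix (summary added for clarity):
-      #
-      #                     predicted spliced    predicted unspliced
-      #  spliced read           true_pos             false_neg
-      #  unspliced read         false_pos            true_neg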
-
-      # total time spend for get seq and scores
-      self.get_time  = 0.0
-      self.calcAlignmentScoreTime = 0.0
-      self.alternativeScoresTime = 0.0
-
-      self.count_time = 0.0
-      self.main_loop = 0.0
-      self.splice_site_time = 0.0
-      self.computeSpliceAlignWithQualityTime = 0.0
-      self.computeSpliceWeightsTime = 0.0
-      self.DotProdTime = 0.0
-      self.array_stuff = 0.0
-      stop = cpu()
-
-      self.init_time = stop-start
-
-   def filter(self):
-      """
-      This method...
-      """
-      run = self.run 
-
-      ctr = 0
-      unspliced_ctr  = 0
-      spliced_ctr    = 0
-
-      print 'Starting filtering...'
-      _start = cpu()
-
-      for location,original_line in self.all_remapped_reads:
-         
-         if ctr % 1000 == 0:
-            print ctr
-
-         id       = location['id']
-         chr      = location['chr']
-         pos      = location['pos']
-         strand   = location['strand']
-         seq      = location['seq']
-         prb      = location['prb']
-
-         id = int(id)
-
-         seq = seq.lower()
-
-         strand_map = {'D':'+', 'P':'-'}
-
-         strand = strand_map[strand]
-
-         if not chr in self.chromo_range:
-            continue
-
-         unb_seq = unbracket_seq(seq)
-
-         # reads mapped to the minus strand have to be reverse-complemented
-         # (this had been forgotten before)
-         if strand == '-':
-            unb_seq = reverse_complement(unb_seq)
-            seq = reverse_complement(seq)
-
-         effective_len = len(unb_seq)
-
-         genomicSeq_start  = pos
-         #genomicSeq_stop   = pos+effective_len-1
-         genomicSeq_stop   = pos+effective_len
-
-         start = cpu()
-         #currentDNASeq_, currentAcc_, currentDon_ = self.seqInfo.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,False)
-         currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop)
-
-         # just performed some tests to check LookupTable results
-         #assert currentDNASeq_ == currentDNASeq
-         #assert currentAcc_ == currentAcc_
-         #assert currentDon_ == currentDon_
-
-         stop = cpu()
-         self.get_time += stop-start
-
-         dna            = currentDNASeq
-         exons          = zeros((2,1))
-         exons[0,0]     = 0
-         exons[1,0]     = effective_len
-         est            = unb_seq
-         original_est   = seq
-         quality        = map(lambda x:ord(x)-self.prb_offset,prb)
-
-         currentVMatchAlignment = dna, exons, est, original_est, quality,\
-         currentAcc, currentDon
-
-         #pdb.set_trace()
-
-         #alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-
-         try:
-            alternativeAlignmentScores = self.calcAlternativeAlignments(location)
-         except:
-            alternativeAlignmentScores = []
-
-         if alternativeAlignmentScores == []:
-             # no alignment necessary
-             maxAlternativeAlignmentScore = -inf
-             vMatchScore = 0.0 
-         else:
-             maxAlternativeAlignmentScore = max(alternativeAlignmentScores)
-             # compute alignment for vmatch unspliced read
-             vMatchScore = self.calcAlignmentScore(currentVMatchAlignment)
-             #print 'vMatchScore/alternativeScore: %f %f %s' % (vMatchScore,maxAlternativeAlignmentScore,strand)
-             
-
-         start = cpu()
-         
-         #print 'all candidates %s' % str(alternativeAlignmentScores)
-
-         new_id = id - 1000000300000
-
-         unspliced = False
-         # unspliced 
-         if new_id > 0: 
-            unspliced = True
-
-         # Seems that according to our learned parameters VMatch found a good
-         # alignment of the current read
-         if maxAlternativeAlignmentScore < vMatchScore:
-            unspliced_ctr += 1
-
-            self.result_unspliced_fh.write(original_line+'\n') 
-
-            if unspliced:
-               self.true_neg += 1
-            else:
-               self.false_neg += 1
-
-         # We found an alternative alignment considering splice sites that scores
-         # higher than the VMatch alignment
-         else:
-            spliced_ctr += 1
-
-            self.result_spliced_fh.write(original_line+'\n')
-
-            if unspliced:
-               self.false_pos += 1
-            else:
-               self.true_pos += 1
-
-         ctr += 1
-         stop = cpu()
-         self.count_time = stop-start
-
-      _stop = cpu()
-      self.main_loop = _stop-_start
-
-      print 'Unspliced/Splice: %d %d'%(unspliced_ctr,spliced_ctr)
-      print 'True pos / false pos : %d %d'%(self.true_pos,self.false_pos)
-      print 'True neg / false neg : %d %d'%(self.true_neg,self.false_neg)
-
-
-   def findHighestScoringSpliceSites(self, currentAcc, currentDon, DNA, max_intron_size, read_size, splice_thresh):
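-      """
-      Collect splice site candidates scoring at least splice_thresh:
-      proximal donor/acceptor sites within the read region of the vmatch hit
-      (only the two best are kept) and distal sites up to max_intron_size up-
-      or downstream. (Docstring added for clarity; the details follow from
-      the loops below.)
-      """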
-
-      def signum(a):
-          if a>0:
-              return 1
-          elif a<0:
-              return -1
-          else:
-              return 0
-
-      proximal_acc   = []
-      for idx in xrange(max_intron_size, max_intron_size+read_size/2):
-          if currentAcc[idx]>= splice_thresh:
-            proximal_acc.append((idx,currentAcc[idx]))
-
-      proximal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
-      proximal_acc=proximal_acc[-2:]
-
-      distal_acc   = []
-      for idx in xrange(max_intron_size+read_size, len(currentAcc)):
-          if currentAcc[idx]>= splice_thresh and idx+read_size<len(currentAcc):
-            distal_acc.append((idx, currentAcc[idx], DNA[idx+1:idx+read_size]))
-
-      #distal_acc.sort(lambda x,y: signum(x[1]-y[1])) 
-      #distal_acc=distal_acc[-2:]
-
-      proximal_don   = []
-      for idx in xrange(max_intron_size+read_size/2, max_intron_size+read_size):
-         if currentDon[idx] >= splice_thresh:
-            proximal_don.append((idx, currentDon[idx]))
-
-      proximal_don.sort(lambda x,y: signum(x[1]-y[1]))
-      proximal_don=proximal_don[-2:]
-
-      distal_don   = []
-      for idx in xrange(1, max_intron_size):
-         if currentDon[idx] >= splice_thresh and idx>read_size:
-            distal_don.append((idx, currentDon[idx], DNA[idx-read_size:idx]))
-
-      distal_don.sort(lambda x,y: y[0]-x[0])
-      #distal_don=distal_don[-2:]
-
-      return proximal_acc,proximal_don,distal_acc,distal_don
-
-
-   def calcAlternativeAlignments(self,location):
-      """
-      Given an alignment proposed by Vmatch this function calculates possible
-      alternative alignments taking into account for example matched
-      donor/acceptor positions.
-      """
-
-      run = self.run
-
-      id       = location['id']
-      chr      = location['chr']
-      pos      = location['pos']
-      strand   = location['strand']
-      original_est = location['seq']
-      quality      = location['prb']
-      quality        = map(lambda x:ord(x)-self.prb_offset,quality)
-      
-      original_est = original_est.lower()
-      est = unbracket_seq(original_est)
-      effective_len = len(est)
-
-      genomicSeq_start  = pos - self.max_intron_size
-      genomicSeq_stop   = pos + self.max_intron_size + len(est)
-
-      strand_map = {'D':'+', 'P':'-'}
-      strand = strand_map[strand]
-
-      if strand == '-':
-         est = reverse_complement(est)
-         original_est = reverse_complement(original_est)
-
-      start = cpu()
-      #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop, run['dna_flat_files'])
-      currentDNASeq, currentAcc, currentDon = self.lt1.get_seq_and_scores(chr, strand, genomicSeq_start, genomicSeq_stop)
-      stop = cpu()
-      self.get_time += stop-start
-      dna   = currentDNASeq
-
-      proximal_acc,proximal_don,distal_acc,distal_don = self.findHighestScoringSpliceSites(currentAcc, currentDon, dna, self.max_intron_size, len(est), self.splice_thresh)
-       
-      alternativeScores = []
-
-      #pdb.set_trace()
-      
-      # inlined
-      h = self.h
-      d = self.d
-      a = self.a
-      mmatrix = self.mmatrix
-      qualityPlifs = self.qualityPlifs
-      # inlined
-
-      # find an intron on the 3' end
-      _start = cpu()
-      for (don_pos,don_score) in proximal_don:
-          DonorScore = calculatePlif(d, [don_score])[0]
-
-          for (acc_pos,acc_score,acc_dna) in distal_acc:
-
-              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
-              AcceptorScore = calculatePlif(a, [acc_score])[0] 
-          
-              #print 'don splice: ', (don_pos,don_score), (acc_pos,acc_score,acc_dna), (DonorScore,IntronScore,AcceptorScore)
-
-              # construct a new "original_est"
-              original_est_cut=''
-              
-              est_ptr=0
-              dna_ptr=self.max_intron_size
-              ptr=0
-              acc_dna_ptr=0
-              num_mismatch = 0
-              
-              while ptr<len(original_est):
-                  #print acc_dna_ptr,len(acc_dna),acc_pos,don_pos
-                  
-                  if original_est[ptr]=='[':
-                      dnaletter=original_est[ptr+1]
-                      estletter=original_est[ptr+2]
-                      if dna_ptr < don_pos:
-                          original_est_cut+=original_est[ptr:ptr+4]
-                          num_mismatch += 1
-                      else:
-                          if acc_dna[acc_dna_ptr]==estletter:
-                              original_est_cut += estletter # EST letter
-                          else:
-                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
-                              num_mismatch += 1
-                              #print '['+acc_dna[acc_dna_ptr]+estletter+']' 
-                          acc_dna_ptr+=1 
-                      ptr+=4 
-                  else:
-                      dnaletter=original_est[ptr]
-                      estletter=dnaletter
-                
-                      if dna_ptr < don_pos:
-                          original_est_cut+=estletter # EST letter
-                      else:
-                          if acc_dna[acc_dna_ptr]==estletter:
-                              original_est_cut += estletter # EST letter
-                          else:
-                              num_mismatch += 1
-                              original_est_cut += '['+acc_dna[acc_dna_ptr]+estletter+']' # EST letter
-                              #print '('+acc_dna[acc_dna_ptr]+estletter+')' 
-                          acc_dna_ptr+=1 
-                          
-                      ptr+=1
-
-                  if estletter=='-':
-                      dna_ptr+=1 
-                  elif dnaletter=='-':
-                      est_ptr+=1
-                  else:
-                      dna_ptr+=1 
-                      est_ptr+=1
-              if num_mismatch>self.max_mismatch:
-                  continue
-                         
-              assert(dna_ptr<=len(dna))
-              assert(est_ptr<=len(est))
-
-              #print original_est, original_est_cut              
-
-              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias 
-         
-              alternativeScores.append(score)
-
-              if acc_score>=self.splice_stop_thresh:
-                  break
-              
-      #pdb.set_trace()
-
-      _stop = cpu()
-      self.alternativeScoresTime += _stop-_start
-
-      # find an intron on the 5' end 
-      _start = cpu()
-      for (acc_pos,acc_score) in proximal_acc:
-
-          AcceptorScore = calculatePlif(a, [acc_score])[0]
-          
-          for (don_pos,don_score,don_dna) in distal_don:
-
-              DonorScore = calculatePlif(d, [don_score])[0]
-              IntronScore = calculatePlif(h, [acc_pos-don_pos])[0] 
-          
-              #print 'acc splice: ', (don_pos,don_score,don_dna), (acc_pos,acc_score), (DonorScore,IntronScore,AcceptorScore)
-
-              # construct a new "original_est"
-              original_est_cut=''
-              
-              est_ptr=0
-              dna_ptr=self.max_intron_size
-              ptr=0
-              num_mismatch = 0
-              don_dna_ptr=len(don_dna)-(acc_pos-self.max_intron_size)-1
-              while ptr<len(original_est):
-                  
-                  if original_est[ptr]=='[':
-                      dnaletter=original_est[ptr+1]
-                      estletter=original_est[ptr+2]
-                      if dna_ptr > acc_pos:
-                          original_est_cut+=original_est[ptr:ptr+4] 
-                          num_mismatch += 1
-                      else:
-                          if don_dna[don_dna_ptr]==estletter:
-                              original_est_cut += estletter # EST letter
-                          else:
-                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
-                              num_mismatch += 1
-                              #print '['+don_dna[don_dna_ptr]+estletter+']' 
-                          don_dna_ptr+=1 
-                      ptr+=4 
-                  else:
-                      dnaletter=original_est[ptr]
-                      estletter=dnaletter
-                
-                      if dna_ptr > acc_pos:
-                          original_est_cut+=estletter # EST letter
-                      else:
-                          if don_dna[don_dna_ptr]==estletter:
-                              original_est_cut += estletter # EST letter
-                          else:
-                              original_est_cut += '['+don_dna[don_dna_ptr]+estletter+']' # EST letter
-                              num_mismatch += 1
-                              #print '('+don_dna[don_dna_ptr]+estletter+')' 
-                          don_dna_ptr+=1 
-                          
-                      ptr+=1
-
-                  if estletter=='-':
-                      dna_ptr+=1 
-                  elif dnaletter=='-':
-                      est_ptr+=1
-                  else:
-                      dna_ptr+=1 
-                      est_ptr+=1
-                         
-              if num_mismatch>self.max_mismatch:
-                  continue
-              assert(dna_ptr<=len(dna))
-              assert(est_ptr<=len(est))
-
-              #print original_est, original_est_cut              
-
-              score = computeSpliceAlignScoreWithQuality(original_est_cut, quality, qualityPlifs, run, self.currentPhi)
-              score += AcceptorScore + IntronScore + DonorScore + self.spliced_bias
-         
-              alternativeScores.append(score)
-
-              if don_score>=self.splice_stop_thresh:
-                  break
-              
-      _stop = cpu()
-      self.alternativeScoresTime += _stop-_start
-
-      return alternativeScores
-
-
-   def calcAlignmentScore(self,alignment):
-      """
-      Given an alignment (dna,exons,est) and the current parameter for QPalma
-      this function calculates the dot product of the feature representation of
-      the alignment and the parameter vector i.e the alignment score. 
-      """
-
-      start = cpu()
-      run = self.run
-
-      # Lets start calculation
-      dna, exons, est, original_est, quality, acc_supp, don_supp = alignment
-
-      score = computeSpliceAlignScoreWithQuality(original_est, quality, self.qualityPlifs, run, self.currentPhi)
-
-      stop = cpu()
-      self.calcAlignmentScoreTime += stop-start
-      
-      return score 
diff --git a/scripts/QPalmaPipeline.py b/scripts/QPalmaPipeline.py
deleted file mode 100644 (file)
index 58758b4..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-# first step 
-
-import qpalma.Configuration as Conf
-
-from pipeline.Pipeline import *
-
-import os.path
-import cPickle
-
-from PipelineHeuristic import *
-
-import glob
-import time
-
-
-def get_most_recent_param(current_dir):
-   date_file_list = []
-   for file in glob.glob(current_dir + '/param*'):
-      stats = os.stat(file)
-      lastmod_date = time.localtime(stats[8])
-      date_file_tuple = lastmod_date, file
-      date_file_list.append(date_file_tuple)
-
-   date_file_list.sort()
-   date_file_list.reverse()
-
-   return date_file_list[0][1]
-
-
-class InitStep(PipelineStep):
-
-   def __init__(self):
-
-      result_dir = Conf.result_dir      
-
-
-class FirstStep(PipelineStep):
-   """
-   
-   """
-   
-   def __init__(self):
-      jp = os.path.join
-
-      # a list storing all commands for this particular step
-      cL = []
-
-
-      # fetch the config object which stores all relevant parameters
-      Config = cPickle.load(open(Conf.conf_object_path))
-
-      reads_0_1st = jp(Config['mapping_main_dir'],'reads_0.fl')
-      cmd = 'fix_indels.pl %s > %s' % (Config['reads_location'],reads_0_1st)
-      cL.append(cmd)
-
-      # ATTENTION: Changing working directory
-      os.chdir(Config['mapping_main_dir'])
-
-      cmd = 'map_full_transcript.pl --mismatches=%d --end_gap=%d -no_edit_distance --read_length=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
-      Config['mismatches_1'],\
-      Config['end_gap_1'],\
-      Config['read_size'],\
-      Config['repeat_mapping_1'],\
-      Config['seedlength_1'],\
-      Config['suffixtree_1'])
-
-      cL.append(cmd)
-
-      reads_0_2nd = jp(Config['mapping_spliced_dir'],'reads_0.fl')
-      cmd = 'add_quality_2_fl.pl reads_0.fl reads_3.fl > %s' % reads_0_2nd
-
-      cL.append(cmd)
-
-      # ATTENTION: Changing working directory
-      os.chdir(Config['mapping_spliced_dir'])
-
-      # ATTENTION: Whether we should use the filtering step is not clear
-      #cmd = 'filter_quality.pl 8 reads_0.fl && mv reads_0.fl.filtered reads_0.fl'
-      #cL.append(cmd)
-
-      cmd = 'map_spliced_transcript.pl --mismatches=%d --sub_mismatches=%d --read_length=%d --min_short_end=%d --repeat_mapping=%d --seedlength=%d --dir=. --suffixtree=%s' % (\
-      Config['mismatches_2'],\
-      Config['sub_mismatches_2'],\
-      Config['read_size'],\
-      Config['min_short_end_2'],\
-      Config['repeat_mapping_2'],\
-      Config['seedlength_2'],\
-      Config['suffixtree_2'])
-
-      cL.append(cmd)
-
-      for elem in cL:
-         print elem
-
-      PipelineStep.__init__(self,cL)
-
-
-class SecondStep(PipelineStep):
-   """
-   We can train 
-   """
-
-   pass
-
-
-class ThirdStep(PipelineStep):
-   """
-   We use the QPalma heuristic to predict those reads that should be aligned by
-   the QPalma alignment program even though vmatch found a full match with some
-   mismatches. 
-   """
-   def __init__(self):
-      jp = os.path.join
-
-      # fetch the config object which stores all relevant parameters
-      Config = cPickle.load(open(Conf.conf_object_path))
-
-      data_fn  = jp(Config['heuristic_dir'],'map.vm')
-      param_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
-      param_fn = get_most_recent_param(param_dir)
-      run_fn   = jp(param_dir,'run_obj.pickle')
-
-      result_spliced_fn    = jp(Config['heuristic_dir'],'spliced.heuristic')
-      result_unspliced_fn  = jp(Config['heuristic_dir'],'unspliced.heuristic')
-
-      ph1 = PipelineHeuristic(run_fn,data_fn,param_fn,result_spliced_fn,result_unspliced_fn)
-      ph1.filter()
-
-
-class FourthStep(PipelineStep):
-   """
-   
-   """
-
-
-if __name__ == '__main__':
-   step1 = FirstStep()
diff --git a/scripts/Run.py b/scripts/Run.py
deleted file mode 100644 (file)
index a9755bd..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-
-class Run(dict):
-   """
-   This class represents the needed parameters for one single run of an
-   algorithm. This means that if we, for example, perform a model selection
-   over C, then for each value of C we have to create one Run object storing
-   the respective parameters.
-   """
-
-   def __init__(self):
-      pass
-
-   def __repr__(self):
-      
-      result = ""
-      for key,val in self.iteritems():
-         result += "%s:\t\t %s\n" % (key,str(val))
-
-      return result
-
-if __name__ == '__main__':
-   r1 = Run()
-   r1['attrib_1'] = 12
-   r1['attrib_2'] = 22
-
-   print "%s" % r1
-
diff --git a/scripts/__init__.py b/scripts/__init__.py
deleted file mode 100644 (file)
index e69de29..0000000
diff --git a/scripts/check_and_init.py b/scripts/check_and_init.py
deleted file mode 100644 (file)
index 27c9e19..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-#
-# Sanity-check the configuration parameters and initialize the project
-# directory structure for the pipeline.
-#
-
-import os.path
-import cPickle
-
-import QPalmaConfiguration as Conf
-
-
-def check_vmatch_params(Config):
-   #
-   # Check and set parameters for the first VMatch step 
-   #
-
-   Config['mismatches_1']     = Conf.mismatches_1
-   Config['end_gap_1']        = Conf.end_gap_1
-   Config['read_length_1']    = Conf.read_length_1
-   Config['repeat_mapping_1'] = Conf.repeat_mapping_1
-   Config['seedlength_1']     = Conf.seedlength_1
-   Config['suffixtree_1']     = Conf.suffixtree_1
-
-   assert 0 <= Config['mismatches_1'] <= Config['read_size']
-   assert 0 <= Config['end_gap_1'] <= 5
-   assert 0 <= Config['repeat_mapping_1']  <= 10
-   assert 2 <= Config['seedlength_1'] <= Config['read_size']
-   assert os.path.exists( Config['suffixtree_1'] )
-
-   #
-   # Check and set parameters for the second VMatch step 
-   #
-
-   Config['mismatches_2']     = Conf.mismatches_2
-   Config['sub_mismatches_2'] = Conf.sub_mismatches_2
-   Config['min_short_end_2']  = Conf.min_short_end_2
-   Config['repeat_mapping_2'] = Conf.repeat_mapping_2
-   Config['seedlength_2']     = Conf.seedlength_2
-   Config['suffixtree_2']     = Conf.suffixtree_2
-
-   assert 0 <= Config['mismatches_2'] <= Config['read_size']
-   assert 0 <= Config['sub_mismatches_2'] <= Config['read_size']
-   assert 0 <= Config['min_short_end_2'] <= Config['read_size']
-   assert 0 <= Config['repeat_mapping_2'] <= Config['read_size']
-   assert 2 <= Config['seedlength_2'] <= Config['read_size']
-   assert os.path.exists( Config['suffixtree_2'] )
-
-
-def check_and_init():
-   """
-   The purpose of this script is to take all the global variables from the
-   Configuration file and store them into a dictionary for the pipeline.
-
-   Additionally, sanity checks are performed to make sure that each parameter
-   lies within a sensible interval or that the file it points to exists.
-   """
-
-   jp = os.path.join
-
-   # create a python dictionary to store all configuration parameters
-   Config = {}
-   
-   result_dir = Conf.result_dir
-   assert os.path.exists(result_dir), 'Error you have to specify an existing result_dir.'
-
-   # assign the main result dir in the Config dictionary
-   Config['result_dir'] = result_dir
-
-   Config['reads_location'] = Conf.reads_location
-   #assert os.path.exists(Config['reads_location'])
-
-   Config['read_size']     = Conf.read_size
-   assert 2 <= Config['read_size'] <= 100
-
-   # Check VMatch related parameters
-   check_vmatch_params(Config)
-
-   subdirs = []
-
-   # create subdirs needed for mapping / alignment / remapping
-   mapping_dir = jp(result_dir,'mapping')
-   Config['mapping_dir'] = mapping_dir
-   Config['mapping_main_dir']    = jp(mapping_dir,'main')
-   Config['mapping_spliced_dir'] = jp(mapping_dir,'spliced')
-
-   subdirs.extend([Config['mapping_dir'], Config['mapping_main_dir'],\
-   Config['mapping_spliced_dir']])
-
-   Config['alignment_dir']             = jp(result_dir,'alignment')
-
-   subdirs.extend([Config['alignment_dir']])
-
-   Config['remapping_dir'] = jp(result_dir,'remapping')
-
-   subdirs.extend([Config['remapping_dir']])
-   
-   #try:
-   #   for current_dir in subdirs:
-   #      os.mkdir(current_dir)
-   #except:
-   #   print 'Error during initialization of project directories'
-
-   # store the configuration in a pickled python dictionary
-   cPickle.dump(Config,open(Conf.conf_object_path,'w+'))
-
-
-if __name__ == '__main__':
-   check_and_init()
diff --git a/scripts/check_dataset.py b/scripts/check_dataset.py
deleted file mode 100644 (file)
index 411f727..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-#
-# The purpose of this script is to check the created dataset pickle files for
-# consistency before doing any kind of training/prediction on the data.
-#
-
-import sys
-import pdb
-import cPickle
-
-from qpalma.sequence_utils import get_seq_and_scores,unbracket_seq,reverse_complement
-
-
-def checkAll(filename):
-   """
-   This function loads the dataset and performs some sanity checks and
-   assertions to make sure the set is in the right shape for QPalma to train
-   or predict on.
-   """
-
-   dataset = cPickle.load(open(filename))
-
-   # we take the first quality vector of the tuple of quality vectors
-   quality_index = 0
-
-   status = True
-   mes = '---'
-
-   idx = 0
-   for example_key in dataset.keys():
-      matches = dataset[example_key]
-      print 'Current example %d has %d matches' % (example_key,len(matches))
-
-      for example in matches:
-         (currentSeqInfo,original_est,currentQualities) = example
-
-         (id,chromo,strand,genomicSeq_start,genomicSeq_stop) = currentSeqInfo
-
-         assert chromo in range(1,6), pdb.set_trace()
-         assert strand in ['+','-'], pdb.set_trace()
-
-         quality = currentQualities[quality_index]
-
-         # check for key consistency
-         assert id == example_key
-
-         idx += 1
-         if idx % 1000 == 0:
-            print 'processed %d examples' % idx
-
-         dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-         genomic_seq_pos,acc_pos,don_pos = get_seq_and_scores(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
-         genomic_seq_neg,acc_neg,don_neg = get_seq_and_scores(chromo,'-',genomicSeq_start,genomicSeq_stop,dna_flat_files,False)
-
-         assert reverse_complement(genomic_seq_neg) == genomic_seq_pos
-
-
-   return status,mes
-
-
-if __name__ == '__main__':
-   dataset_fn = sys.argv[1]
-   status,mes = checkAll(dataset_fn )
-
-   if status == True:   
-      print 'Dataset %s seems to be consistent.' % dataset_fn
-   else:
-      print 'Dataset %s seems to be inconsistent!' % dataset_fn
-      print mes
diff --git a/scripts/createAlignmentFileFromPrediction.py b/scripts/createAlignmentFileFromPrediction.py
deleted file mode 100644 (file)
index f591ae2..0000000
+++ /dev/null
@@ -1,250 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import cPickle
-import math
-import numpy
-import os
-import os.path
-import pdb
-import sys
-
-from qpalma.utils import pprint_alignment
-
-import palma.palma_utils as pu
-from palma.output_formating import print_results
-
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
-
-# a little auxiliary function
-pp = lambda x : str(x)[1:-1].replace(' ','')
-
-
-def alignment_reconstruct(current_prediction,num_exons):
-   """
-   We reconstruct the exact alignment to infer the positions of indels and the
-   sizes of the respective exons.
-   """
-
-   translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char) 
-
-   SpliceAlign = current_prediction['spliceAlign']
-   estAlign    = current_prediction['estAlign']
-
-   dna_array   = current_prediction['dna_array']
-   read_array  = current_prediction['read_array']
-
-   dna_array   = "".join(map(lambda x: translation[int(x)],dna_array))
-   read_array  = "".join(map(lambda x: translation[int(x)],read_array))
-
-   # this array holds a number for each exon indicating its number of matches
-   exonIdentities = [0]*num_exons
-   exonGaps       = [0]*num_exons
-
-   gaps     = 0
-   identity = 0
-   idx      = 0
-
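-   # walk along the aligned read; '_' marks intron positions, '-' marks gaps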
-   for ridx in range(len(read_array)):
-      read_elem = read_array[ridx]
-      dna_elem = dna_array[ridx]
-
-      if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
-         idx += 1
-         gaps = 0
-         identity = 0
-         
-      if read_elem == '_':
-         continue
-
-      if read_elem == dna_elem:
-         identity += 1
-
-      if read_elem == '-':
-         gaps += 1
-      
-      exonIdentities[idx] = identity
-      exonGaps[idx] = gaps
-   
-   return exonIdentities,exonGaps
-
-
-def create_alignment_file(current_loc,out_fname):
-
-   out_fh = open(out_fname,'w+')
-   allPredictions = cPickle.load(open(current_loc))
-   #allPredictions = current_loc
-
-   print 'Loaded %d examples' % len(allPredictions)
-
-   # fetch the data needed ('settings' is expected to be provided as a global by the caller)
-   accessWrapper = DataAccessWrapper(settings)
-   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
-   # Uniqueness filtering using alignment score for reads with several
-   # alignments
-   allUniquePredictions = {}
-
-   for new_prediction in allPredictions:
-      id = new_prediction['id']
-      if allUniquePredictions.has_key(id):
-         current_prediction = allUniquePredictions[id]
-
-         current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
-         new_score =  new_prediction['DPScores'].flatten().tolist()[0][0]
-
-         if current_a_score < new_score :
-            allUniquePredictions[id] = new_prediction
-
-      else:
-         allUniquePredictions[id] = new_prediction
-
-   written_lines = 0
-   for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
-
-      id = current_prediction['id']
-
-      seq         = current_prediction['read']
-      dna         = current_prediction['dna']
-
-      # CHECK: the quality string is not part of the prediction yet, use a placeholder
-      q1          = 'zzz'
-
-      chromo      = current_prediction['chr']
-      strand      = current_prediction['strand']
-      start_pos   = current_prediction['start_pos']
-      alignment   = current_prediction['alignment']
-
-      DPScores    = current_prediction['DPScores']
-      predExons   = current_prediction['predExons']
-      predExons   = [e+start_pos for e in predExons]
-
-      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
-      tExonSizes,tStarts, tEnds) = alignment
-
-      if len(qExonSizes) != num_exons:
-         print 'BUG exon sizes %d'%id
-         continue
-
-      EST2GFF = False
-      new_line = ''
-
-      if EST2GFF:
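-         # emit a PSL-like line; the field layout below mirrors BLAT's PSL format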
-         predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
-
-         match = predExons[0,1] - predExons[0,0] 
-         if predExons.shape == (2,2):
-            match += predExons[1,1] - predExons[1,0] 
-
-         mismatch = 0
-         repmatch = 0
-         Ns = 0
-         Qgapcnt = 0
-         Qgapbs = 0
-
-         Tgapcnt = 0
-         Tgapbs = 0
-
-         if predExons.shape == (2,2):
-            Tgapbs += predExons[1,0] - predExons[0,1]
-            Tgapcnt = 1
-
-         # ATTENTION
-         #  
-         # we enforce equal exons sizes for q and t because we are missing indel
-         # positions of the alignment
-
-         if qExonSizes[0] != tExonSizes[0]:
-            continue
-
-         Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
-         Qsize = len(seq)
-         Qstart = qStart
-         Qend = qEnd
-         Tname = 'CHR%d'%chromo
-
-         start_pos -= 2
-
-         Tsize = tEnd+1 + start_pos
-         Tstart =  tStart + start_pos
-         Tend = tEnd + start_pos
-         blockcnt = Tgapcnt+1
-         exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
-         Qstarts_str = str(qStarts)[1:-1].replace(' ','')
-         Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
-
-         new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
-         % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
-         Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
-         Tname, Tsize, Tstart, Tend,\
-         blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
-
-      else:
-         exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
-
-
-         new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-         (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-         pp(qExonSizes),pp(qStarts),\
-         pp(qEnds),pp(tExonSizes),\
-         pp(tStarts),pp(tEnds),\
-         pp(exonIdentities),pp(exonGaps))
-
-      out_fh.write(new_line)
-      written_lines += 1
-
-   print 'Wrote out %d lines' % written_lines
-
-
-def run(result_fn,outfile):
-   """
-   Convert the tab-separated alignment lines in result_fn into genomic
-   coordinates and write them to outfile.
-   """
-
-   out_fh = open(outfile,'w+')
-
-   # fetch the data needed
-   accessWrapper = DataAccessWrapper(settings)
-   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
-   for line in open(result_fn):
-      sl = line.split()
-
-      chromo = int(sl[1])
-      strand = sl[2]
-      start_pos = int(sl[5])-2
-      starts   = sl[15].split(',')
-      ends     = sl[16].split(',')
-
-      ids = sl[-2].split(',')
-      ids   = [int(elem) for elem in ids]
-      gaps = sl[-1].split(',')
-      gaps   = [int(elem) for elem in gaps]
-
-      if strand == '+':
-         pass
-      elif strand == '-':
-         total_size = seqInfo.chromo_sizes[chromo]
-         start_pos = total_size - start_pos
-      else:
-         assert False
-
-      starts   = [int(elem) + start_pos for elem in starts]
-      ends     = [int(elem) + start_pos for elem in ends]
-
-      if strand == '+':
-         starts   = [e+1 for e in starts]
-         ends     = [e+1 for e in ends]
-      else:
-         starts   = [e-3001 for e in starts]
-         ends     = [e-3001 for e in ends]
-
-      out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
diff --git a/scripts/createCoverageFigures.sh b/scripts/createCoverageFigures.sh
deleted file mode 100755 (executable)
index 3ed57c3..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-cat $1 | grep correct | cut -f 3 > correct.coverage_labels
-cat $1 | grep wrong | cut -f 3 > wrong.coverage_labels
-
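-# generate a small MATLAB script that plots both coverage histograms and saves them as EPS files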
-( cat <<EOF
-load correct.coverage_labels;
-load wrong.coverage_labels;
-correct_vec = zeros(10,1);
-wrong_vec = zeros(10,1);
-for i=1:10,
-   correct_vec(i) = sum([correct == (i-1)]);
-   wrong_vec(i) = sum([wrong == (i-1)]);
-end
-bar(correct_vec,1)
-hold;
-set(gca,'xlim',[0.5 length(correct_vec)+0.5])
-xlabel('Confirmation count')
-ylabel('Frequency')
-set(gca,'XTick',[0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5])
-set(gca,'XTickLabel',[0,1,2,3,4,5,6,7,8,9,10])
-saveas(gcf,'pos_cov.eps')
-clf;
-bar(wrong_vec,1);
-hold;
-set(gca,'xlim',[0.5 length(correct_vec)+0.5])
-xlabel('Confirmation count')
-ylabel('Frequency')
-set(gca,'XTick',[0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5,8.5,9.5,10.5])
-set(gca,'XTickLabel',[0,1,2,3,4,5,6,7,8,9,10])
-saveas(gcf,'neg_cov.eps')
-EOF
-) > test_script.m
-
-matlab -nojvm -r "test_script;" || echo -e "\nmatlab script failed!\n"
diff --git a/scripts/debugDataset.py b/scripts/debugDataset.py
deleted file mode 100644 (file)
index f87b3aa..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*-
-
-import pdb
-
-from qpalma.DataProc import *
-from compile_dataset import getSpliceScores, get_seq_and_scores
-
-def debugDataset():
-   filename = 'dataset_remapped_test_new'
-
-   SeqInfo, Exons, OriginalEsts, Qualities,\
-   AlternativeSequences = paths_load_data(filename,None,None,None)
-
-   beg = 0
-   end = 5000
-
-   SeqInfo     = SeqInfo[beg:end]
-   Exons       = Exons[beg:end]
-   OriginalEsts= OriginalEsts[beg:end]
-   Qualities   = Qualities[beg:end]
-   AlternativeSequences = AlternativeSequences[beg:end]
-
-   for exampleIdx in range(4579,4580):
-      currentSeqInfo = SeqInfo[exampleIdx]
-      chr,strand,up_cut,down_cut = currentSeqInfo 
-
-      dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
-
-      currentAlternatives = AlternativeSequences[exampleIdx]
-      for alternative_alignment in currentAlternatives:
-         chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
-         if not chr in range(1,6):
-            return 
-
-         print chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel
-         #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
-
-   pdb.set_trace()
-
-if __name__ == '__main__':
-   debugDataset()
-
diff --git a/scripts/est2gff.sh b/scripts/est2gff.sh
deleted file mode 100755 (executable)
index 0a3270d..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-g_config=/fml/ag-raetsch/share/databases/genomes/A_thaliana/arabidopsis_tair7/genebuild/genome.config
-
-# First we count the numbers of incorrect gt and gc positions for the full
-# alignment set not filtered by coverage numbers etc.
-
-alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.consistent.unique
-
-touch gt_dpscores.hist && rm gt_dpscores.hist
-touch gc_dpscores.hist && rm gc_dpscores.hist
-
-for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
-do
-   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
-   | grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl >> gt_dpscores.hist
-   #| grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gt_dpscores.hist
-
-   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
-   | grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl >> gc_dpscores.hist
-   #| grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gc_dpscores.hist
-done
-
-# Now we determine the numbers of incorrect gt and gc positions for the full
-# alignment set FILTERED by coverage numbers.
-
-alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.coverage_filtered
-
-touch gt_dpscores.hist.coverage_filtered && rm gt_dpscores.hist.coverage_filtered 
-touch gc_dpscores.hist.coverage_filtered && rm gc_dpscores.hist.coverage_filtered 
-
-for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
-do
-   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
-   | grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl >> gt_dpscores.hist.coverage_filtered 
-   #| grep -A1 '###' | grep -B1 'gt..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gt_dpscores.hist.coverage_filtered 
-
-   /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR + psl2gff.result psl2gff_2.result $g_config \
-   | grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl >> gc_dpscores.hist.coverage_filtered 
-   #| grep -A1 '###' | grep -B1 'gc..\.' | grep check_psl  |cut -f2 -d '(' | cut -f1 -d ')' >> gc_dpscores.hist.coverage_filtered 
-done
-
-cat gt_dpscores.hist | sort > gt_dpscores.hist.sorted
-cat gt_dpscores.hist.coverage_filtered | sort > gt_dpscores.hist.coverage_filtered.sorted
-diff  --suppress-common-lines gt_dpscores.hist.coverage_filtered.sorted gt_dpscores.hist.sorted > DIFF
diff --git a/scripts/est2gff_pipeline.sh b/scripts/est2gff_pipeline.sh
deleted file mode 100755 (executable)
index 926df1d..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-#
-# This script takes the alignment output of QPalma and uses the script psl2gff
-# to cluster the reads together in order to infer gene structures.
-# 
-# The result is a (are several) gff file(s) that are used as input to the
-# compute_splicegraph script of the genebuild project.
-#
-
-g_config=/fml/ag-raetsch/share/databases/genomes/A_thaliana/arabidopsis_tair7/genebuild/genome.config
-
-# First we count the numbers of incorrect gt and gc positions for the full
-# alignment set not filtered by coverage numbers etc.
-
-alignment_file=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/full.align.consistent.unique
-
-for CHR in "CHR1" "CHR2" "CHR3" "CHR4" "CHR5"
-do
-   for STRAND in "+" "-"
-   do
-      echo $CHR $STRAND
-      result1_fn=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/SpliceGraphResults/gff/test_myexons_${CHR}${STRAND}.gff
-      result2_fn=/dev/null
-   
-      /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR ${STRAND} $result1_fn $result2_fn $g_config 1>>LOG
-   done
-done
diff --git a/scripts/evaluateVmatch.py b/scripts/evaluateVmatch.py
deleted file mode 100644 (file)
index 1fd6432..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import pdb
-import os
-import os.path
-
-
-def compare_alignments(aln1,aln2):
-   # TODO: not implemented yet
-   return
-
-
-def compare_all_alignments(reads_fn,vmatch_result_dir):
-   read_positions = {}
-
-   vmatch_correct_ctr = 0
-
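-   # assumed reads-file layout (based on the indices used below): column 0 = read id, column 10 = true start, column 13 = true end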
-   for line in open(reads_fn):
-      line = line.strip()
-      slist = line.split()
-      id = int(slist[0])
-      start_pos = int(slist[10])
-      end_pos = int(slist[13])
-
-      read_positions[id] = (start_pos,end_pos)
-
-   for line in open(vmatch_result_dir):
-      line = line.strip()
-      slist = line.split()
-   
-      id = int(slist[0])
-      vmatch_start_pos = int(slist[2])
-
-      start_pos,end_pos = read_positions[id]
-
-      if vmatch_start_pos == start_pos:
-         vmatch_correct_ctr += 1
-
-   print 'vmatch found the correct start position for %d reads' % vmatch_correct_ctr
-
-
-if __name__ == '__main__':
-   reads_fn          = sys.argv[1]
-   vmatch_result_dir = sys.argv[2]
-   compare_all_alignments(reads_fn,vmatch_result_dir)
diff --git a/scripts/grid_alignment.py b/scripts/grid_alignment.py
deleted file mode 100644 (file)
index 68c12c8..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-import cPickle
-import sys
-import time
-import pdb
-import os
-import os.path
-import math
-
-from pythongrid import KybJob, Usage
-from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
-
-from createAlignmentFileFromPrediction import create_alignment_file
-
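-# NOTE: the module imports itself so that KybJob can reference g_alignment by its module-qualified name (assumed pythongrid convention)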
-import grid_alignment
-
-
-def g_alignment(chunk_fn,result_fn):
-   create_alignment_file(chunk_fn,result_fn)
-
-
-def create_and_submit():
-   jp = os.path.join
-
-   run_dir     = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
-   result_dir  = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/alignment'
-   
-   chunks_fn = []
-   for fn in os.listdir(run_dir):
-      if fn.startswith('chunk'):
-         chunks_fn.append(fn)
-
-   #chunks_fn = [\
-   #'chunk_9.predictions.pickle',\
-   #'chunk_14.predictions.pickle',\
-   #'chunk_24.predictions.pickle']
-
-   print chunks_fn
-
-   functionJobs=[]
-
-   for chunk_fn in chunks_fn:
-      chunk_name  = chunk_fn[:chunk_fn.find('.')]
-      result_fn   = jp(result_dir,'%s.align_remap'%chunk_name)
-      chunk_fn = jp(run_dir,chunk_fn) 
-
-      #pdb.set_trace()
-
-      current_job = KybJob(grid_alignment.g_alignment,[chunk_fn,result_fn])
-      current_job.h_vmem = '15.0G'
-      current_job.express = 'True'
-
-      print "job:", current_job.nativeSpecification
-
-      # each chunk is submitted as its own job, with a short pause in between
-      functionJobs = [current_job]
-      (sid, jobids) = submit_jobs(functionJobs)
-      time.sleep(10)
-
-
-if __name__ == '__main__':
-   create_and_submit()
diff --git a/scripts/grid_heuristic.py b/scripts/grid_heuristic.py
deleted file mode 100644 (file)
index a925bbe..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import cPickle
-import sys
-import pdb
-import time
-import os
-import os.path
-import math
-
-from pythongrid import KybJob, Usage
-from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
-
-from PipelineHeuristic import *
-
-import grid_heuristic
-
-from Utils import split_file
-
-
-def g_heuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname):
-   # PipelineHeuristic writes spliced and unspliced reads to separate result files
-   ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
-   ph1.filter()
-
-   return 'finished filtering set %s.' % data_fname
-
-
-def create_and_submit():
-   jp = os.path.join
-
-   num_splits = 50
-
-   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
-   #data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'
-
-   data_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
-
-   run_fname      = jp(run_dir,'run_obj.pickle')
-
-   original_map_fname = jp(data_dir,'map.vm')
-
-   split_file(original_map_fname,data_dir,num_splits)
-
-   param_fname    = jp(run_dir,'param_526.pickle')
-
-   functionJobs=[]
-
-   for idx in range(0,num_splits):
-      data_fname     = jp(data_dir,'map.part_%d'%idx)
-      # NOTE: result file names chosen here; PipelineHeuristic expects separate
-      # files for spliced and unspliced reads (cf. its five-argument constructor)
-      result_spliced_fname   = jp(data_dir,'map.vm.part_%d.spliced.heuristic'%idx)
-      result_unspliced_fname = jp(data_dir,'map.vm.part_%d.unspliced.heuristic'%idx)
-
-      current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname])
-      current_job.h_vmem = '25.0G'
-      #current_job.express = 'True'
-
-      print "job:", current_job.nativeSpecification
-
-      functionJobs.append(current_job)
-      #break
-
-   (sid, jobids) = submit_jobs(functionJobs)
-   #print 'checking whether finished'
-   #while not get_status(sid, jobids):
-   #   time.sleep(7)
-   #print 'collecting jobs'
-   #retjobs = collect_jobs(sid, jobids, functionJobs)
-   #print "ret fields AFTER execution on cluster"
-   #for (i, job) in enumerate(retjobs):
-   #   print "Job #", i, "- ret: ", job.ret
-
-   #print '--------------'
-
-
-if __name__ == '__main__':
-   #split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
-   create_and_submit()
diff --git a/scripts/grid_predict.py b/scripts/grid_predict.py
deleted file mode 100644 (file)
index c5ab75e..0000000
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import cPickle
-import sys
-import time
-import pdb
-import os
-import os.path
-import math
-
-from pythongrid import KybJob, Usage
-from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
-
-from qpalma_main import *
-
-import grid_predict
-
-
-def get_slices(dataset_size,num_nodes):
-   all_instances = []
-
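-   # split [0,dataset_size) into num_nodes contiguous slices; the last slice absorbs the remainder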
-   part = dataset_size / num_nodes
-   begin = 0
-   end = 0
-   for idx in range(1,num_nodes+1):
-      
-      if idx == num_nodes:
-         begin = end
-         end   = dataset_size
-      else:
-         begin = end
-         end = begin+part
-
-      params = (begin,end)
-
-      all_instances.append(params)
-
-   return all_instances
-
-
-def makeJobs(run,dataset_fn,chunks,param_fn):
-   """
-   """
-
-   jobs=[]
-
-   # fetch the data needed
-   g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
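-   # filename patterns for the genome flat files and the splice-score contig files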
-   g_fmt = 'chr%d.dna.flat'
-   s_fmt = 'contig_%d%s'
-
-   num_chromo = 6
-
-   accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-   seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
-   for c_name,current_chunk in chunks:
-      current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,seqInfo,c_name])
-      current_job.h_vmem = '20.0G'
-      #current_job.express = 'True'
-
-      print "job:", current_job.nativeSpecification
-
-      jobs.append(current_job)
-
-   return jobs
-
-
-def create_and_submit():
-   """
-
-   """
-
-   jp = os.path.join
-
-   run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-
-   run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
-   run['name'] = 'saved_run'
-
-   param_fn = jp(run_dir,'param_526.pickle')
-
-   #run['result_dir']    = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/prediction'
-   #dataset_fn           = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.pickle'
-   #prediction_keys_fn   = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset/dataset_run_1.pickle.keys.pickle'
-
-   run['result_dir']    = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/prediction'
-   dataset_fn           = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.pickle'
-   prediction_keys_fn   = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/dataset_run_1.keys.pickle'
-
-   prediction_keys = cPickle.load(open(prediction_keys_fn))
-
-   print 'Found %d keys for prediction.' % len(prediction_keys)
-
-   num_splits = 50
-   #num_splits = 1
-   slices = get_slices(len(prediction_keys),num_splits)
-   chunks = []
-   for idx,slice in enumerate(slices):
-      c_name = 'chunk_%d' % idx
-      chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
-
-   functionJobs = makeJobs(run,dataset_fn,chunks,param_fn)
-
-   # do not shadow the sum builtin; report how many keys the chunks cover
-   total_keys = 0
-   for size in [len(elem) for name,elem in chunks]:
-      total_keys += size
-
-   print 'Got %d job(s) covering %d keys' % (len(functionJobs),total_keys)
-
-   #print "output ret field in each job before sending it onto the cluster"
-   #for (i, job) in enumerate(functionJobs):
-   #   print "Job with id: ", i, "- ret: ", job.ret
-   #print ""
-   #print "sending function jobs to cluster"
-   #print ""
-
-   (sid, jobids) = submit_jobs(functionJobs)
-
-
-def g_predict(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name):
-   """
-  
-   """
-
-   qp = QPalma(False)
-   qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,seqInfo,set_name)
-   return 'finished prediction of set %s.' % set_name
-
-
-if __name__ == '__main__':
-   create_and_submit()
diff --git a/scripts/plot_error.py b/scripts/plot_error.py
deleted file mode 100644 (file)
index dc9155a..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
-import cPickle
-import sys
-
-filename = sys.argv[1]
-
-predict = cPickle.load(open(filename))
-
-correct = {}
-off     = {}
-wrong = {}
-
-split_count = [0]*36
-
-for elem in predict:
-   current_elem = elem['realRestPos']
-
-   split_count[current_elem] += 1
-
-   if len(elem['predExons']) != 4:
-      wrong[current_elem] = wrong.get(current_elem,0) + 1
-
-   else:
-      e1b = elem['e1_b_off']
-      e1e = elem['e1_e_off']
-      e2b = elem['e2_b_off']
-      e2e = elem['e2_e_off']
-
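-      # all four exon-boundary offsets are zero <=> both splice sites were predicted exactly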
-      if e1b == 0 and e1e == 0 and e2b == 0 and e2e == 0:
-         correct[current_elem] = correct.get(current_elem,0) + 1
-      else:
-         off[current_elem] = off.get(current_elem,0) + 1
-
-import pylab
-
-print split_count
-
-y_values = []
-for overlap in range(36):
-   # fraction of predictions at this overlap position that were off or wrong
-   result = off.get(overlap,0) + wrong.get(overlap,0)
-
-   if split_count[overlap] > 0:
-      result = result/(1.0*split_count[overlap])
-   else:
-      result = 0
-
-   y_values.append(result)
-
-outfile = filename.replace('_allPredictions','')
-
-import numpy
-pylab.bar(range(36),y_values,width=0.2,align='center')
-pylab.xticks( numpy.arange(0,36,1) )
-pylab.savefig('%s_error_vs_overlap.eps'%outfile)
-pylab.clf()
-#raw_input()
diff --git a/scripts/predictAll.sh b/scripts/predictAll.sh
deleted file mode 100755 (executable)
index 407efbc..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-instance_ctr=1
-
-for instance in `ls -1 run*.pickle`
-do
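-   # strip the .pickle suffix to obtain the run directory name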
-   instance_dir=${instance/\.pickle/}
-
-   rt="/fml/ag-raetsch/home/fabio/tmp/QPalma"
-   newest_param=`ls -t ${rt}/${instance_dir}/param_* | head -n1`
-   echo $newest_param
-
-   script=predictAll_${instance_ctr}.py
-
-   echo "from qpalma_main import QPalma"  >> $script
-   echo "import sys"                      >> $script
-   echo "from cPickle import *"           >> $script
-   echo "import io_pickle"                >> $script
-   echo "dataset = io_pickle.load('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_20k')" >> $script
-
-   echo "ma=load(open('${instance}'))"          >> $script
-   echo "ma.run['dataset_filename'] = dataset"  >> $script
-
-   #echo "ma.run['training_begin'] = 0" >> $script
-   #echo "ma.run['training_end'] = 100" >> $script
-
-   echo "ma.run['prediction_begin'] = 0"        >> $script
-   echo "ma.run['prediction_end'] = 20000"      >> $script
-
-   echo "ma.run['min_intron_len'] = 20"         >> $script
-   echo "ma.run['max_intron_len'] = 2000"       >> $script
-   echo "ma.run['min_svm_score'] = 0.0 "        >> $script
-   echo "ma.run['max_svm_score'] = 1.0"         >> $script
-   echo "ma.run['min_qual'] = -5"               >> $script
-   echo "ma.run['max_qual'] = 40"               >> $script
-   echo "qpalma1 = QPalma(ma.run)"              >> $script
-   echo "qpalma1.evaluate('${newest_param}',dataset)" >> $script
-
-   script_startup=${script}_startup
-   
-   echo "source ~/.bashrc"  >> $script_startup
-   echo "export ILOG_LICENSE_FILE=/fml/ag-raetsch/share/software/ilog/ilm/access2.ilm" >> $script_startup
-   echo "python $script" >> $script_startup
-
-
-   # the startup file is a shell script, so it has to be run with bash, not python
-   echo bash $script_startup | qsub -l h_vmem=3.0G -cwd -j y -N inst_${instance}.log
-
-   instance_ctr=$((instance_ctr+1))
-done
diff --git a/scripts/qpalma_main.py b/scripts/qpalma_main.py
deleted file mode 100644 (file)
index b3349be..0000000
+++ /dev/null
@@ -1,668 +0,0 @@
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import array
-import cPickle
-import math
-import os.path
-import pdb
-import sys
-
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq,get_seq_and_scores
-
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
-from numpy.linalg import norm
-
-# one of these solver implementations has to be imported for training:
-#from qpalma.SIQP_CPX import SIQPSolver
-#from qpalma.SIQP_CVXOPT import SIQPSolver
-
-import QPalmaDP
-import qpalma
-from qpalma.computeSpliceWeights import *
-from qpalma.set_param_palma import *
-from qpalma.computeSpliceAlignWithQuality import *
-from qpalma.TrainingParam import Param
-from qpalma.Plif import Plf,compute_donacc
-
-from qpalma.utils import pprint_alignment, get_alignment
-
-class SpliceSiteException(Exception):
-   pass
-
-
-def getData(training_set,exampleKey,run):
-   """ This function...  """
-
-   currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
-   id,chr,strand,up_cut,down_cut = currentSeqInfo
-
-   est = original_est
-   est = "".join(est)
-   est = est.lower()
-   est = unbracket_seq(est)
-   est = est.replace('-','')
-
-   assert len(est) == run['read_size'], pdb.set_trace()
-   est_len = len(est)
-
-   #original_est = OriginalEsts[exampleIdx]
-   original_est = "".join(original_est)
-   original_est = original_est.lower()
-
-   dna_flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-   dna, acc_supp, don_supp = get_seq_and_scores(chr,strand,up_cut,down_cut,dna_flat_files)
-
-   #currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
-
-   original_exons = currentExons
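-   # convert the exon coordinates from genomic positions to positions relative to the cut region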
-   exons = original_exons - (up_cut-1)
-   exons[0,0] -= 1
-   exons[1,0] -= 1
-
-   if exons.shape == (2,2):
-      fetched_dna_subseq = dna[exons[0,0]:exons[0,1]] + dna[exons[1,0]:exons[1,1]]
-     
-      donor_elem = dna[exons[0,1]:exons[0,1]+2]
-      acceptor_elem = dna[exons[1,0]-2:exons[1,0]]
-
-      if not ( donor_elem == 'gt' or donor_elem == 'gc' ):
-         print 'invalid donor in example %d'% exampleKey
-         raise SpliceSiteException
-
-      if not ( acceptor_elem == 'ag' ):
-         print 'invalid acceptor in example %d'% exampleKey
-         raise SpliceSiteException
-
-      assert len(fetched_dna_subseq) == len(est), pdb.set_trace()
-
-   return dna,est,acc_supp,don_supp,exons,original_est,currentQualities
-
-
-class QPalma:
-   """
-   This class wraps the training and prediction functions for 
-   the alignment.
-   """
-   
-   def __init__(self,run,seqInfo,dmode=False):
-      self.ARGS = Param()
-      self.qpalma_debug_mode = dmode
-      self.run = run
-      self.seqInfo = seqInfo
-
-
-   def plog(self,string):
-      self.logfh.write(string)
-      self.logfh.flush()
-
-
-   def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
-      """
-      Given the needed input this method calls the QPalma C module which
-      calculates a dynamic programming in order to obtain an alignment
-      """
-
-      dna_len = len(dna)
-      est_len = len(est)
-
-      prb = QPalmaDP.createDoubleArrayFromList(quality)
-      chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
-
-      matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-      mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
-
-      d_len = len(donor)
-      donor = QPalmaDP.createDoubleArrayFromList(donor)
-      a_len = len(acceptor)
-      acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
-
-      # Create the alignment object representing the interface to the C/C++ code.
-      currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
-      c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
-      # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
-      currentAlignment.myalign( current_num_path, dna, dna_len,\
-       est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-       acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
-       self.run['print_matrix'] )
-
-      if prediction_mode:
-         # part that is only needed for prediction
-         result_len = currentAlignment.getResultLength()
-         dna_array,est_array = currentAlignment.getAlignmentArraysNew()
-      else:
-         dna_array = None
-         est_array = None
-
-      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores, newQualityPlifsFeatures =\
-      currentAlignment.getAlignmentResultsNew()
-
-      return newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
-      newQualityPlifsFeatures, dna_array, est_array
-
-
-   def init_train(self,training_set):
-      full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
-
-      #assert not os.path.exists(full_working_path)
-      if not os.path.exists(full_working_path):
-         os.mkdir(full_working_path)
-
-      assert os.path.exists(full_working_path)
-
-      # ATTENTION: Changing working directory
-      os.chdir(full_working_path)
-
-      self.logfh = open('_qpalma_train.log','w+')
-      cPickle.dump(self.run,open('run_obj.pickle','w+'))
-
-      self.plog("Settings are:\n")
-      self.plog("%s\n"%str(self.run))
-
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
-
-
-   def setUpSolver(self,numExamples,lengthSP,donSP,accSP):
-      # Initialize the solver; all problem sizes have to be passed in explicitly
-      self.plog('Initializing problem...\n')
-
-      run = self.run
-      try:
-         solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
-      except:
-         self.plog('Got no license. Telling queue to reschedule job...\n')
-         sys.exit(99)
-
-      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
-      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
-
-      return solver
-
-
-   def train(self,training_set):
-      """
-      The mainloop for training.
-      """
-
-      run = self.run
-      numExamples = len(training_set)
-      self.plog('Number of training examples: %d\n'% numExamples)
-
-      self.noImprovementCtr = 0
-      self.oldObjValue = 1e8
-
-      iteration_steps         = self.run['iter_steps']
-      remove_duplicate_scores = self.run['remove_duplicate_scores']
-      print_matrix            = self.run['print_matrix']
-      anzpath                 = self.run['anzpath']
-
-      # Initialize parameter vector
-      param = numpy.matlib.rand(run['numFeatures'],1)
-   
-      lengthSP    = self.run['numLengthSuppPoints']
-      donSP       = self.run['numDonSuppPoints']
-      accSP       = self.run['numAccSuppPoints']
-      mmatrixSP   = self.run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = self.run['numQualSuppPoints']
-      totalQualSP = self.run['totalQualSuppPoints']
-
-      # no intron length model
-      if not self.run['enable_intron_length']:
-         param[:lengthSP] *= 0.0
-
-      # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
-
-      solver = self.setUpSolver(numExamples,lengthSP,donSP,accSP)
-
-      # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = [anzpath]*numExamples
-
-      # stores the gap for each example
-      gap      = [0.0]*numExamples
-
-      currentPhi = zeros((run['numFeatures'],1))
-      totalQualityPenalties = zeros((totalQualSP,1))
-
-      numConstPerRound = run['numConstraintsPerRound']
-      solver_call_ctr = 0
-
-      suboptimal_example = 0
-      iteration_nr = 0
-      param_idx = 0
-      const_added_ctr = 0
-
-      featureVectors = zeros((run['numFeatures'],numExamples))
-
-      self.plog('Starting training...\n')
-      # the main training loop
-      while True:
-         if iteration_nr == iteration_steps:
-            break
-
-         for exampleIdx,example_key in enumerate(training_set.keys()):
-            print 'Current example %d' % example_key
-            try:
-               dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
-               getData(training_set,example_key,run)
-            except SpliceSiteException:
-               continue
-
-            dna_len = len(dna)
-
-            if run['enable_quality_scores']:
-               # we take the first quality vector of the tuple of quality vectors
-               quality = currentQualities[0]
-            else:
-               quality = [40]*len(est)
-
-            if not run['enable_splice_signals']:
-               for idx,elem in enumerate(don_supp):
-                  if elem != -inf:
-                     don_supp[idx] = 0.0
-
-               for idx,elem in enumerate(acc_supp):
-                  if elem != -inf:
-                     acc_supp[idx] = 0.0
-
-            # Compute the parameters of the true alignment (but with untrained d,a,h ...)
-            if run['mode'] == 'using_quality_scores':
-               trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
-               computeSpliceAlignWithQuality(dna, exons, est, original_est,\
-               quality, qualityPlifs,run)
-            else:
-               trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
-               dna_calc = dna  # no quality-adjusted DNA in this mode
-
-            dna_calc = dna_calc.replace('-','')
-
-            #print 'right before computeSpliceWeights exampleIdx %d' % exampleIdx
-            # Calculate the weights
-            trueWeightDon, trueWeightAcc, trueWeightIntron =\
-            computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
-            trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
-
-            currentPhi[0:lengthSP]                                            = mat(h.penalties[:]).reshape(lengthSP,1)
-            currentPhi[lengthSP:lengthSP+donSP]                               = mat(d.penalties[:]).reshape(donSP,1)
-            currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
-            currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
-
-            if run['mode'] == 'using_quality_scores':
-               totalQualityPenalties = param[-totalQualSP:]
-               currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
-
-            # Calculate w'phi(x,y) the total score of the alignment
-            trueAlignmentScore = (trueWeight.T * currentPhi)[0,0]
-
-            # The allWeights vector is supposed to store the weight parameter
-            # of the true alignment as well as the weight parameters of the
-            # num_path[exampleIdx] other alignments
-            allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
-            allWeights[:,0] = trueWeight[:,0]
-
-            AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
-            AlignmentScores[0] = trueAlignmentScore
-
-            ################## Calculate wrong alignment(s) ######################
-            # Compute donor, acceptor with penalty_lookup_new
-            # returns two double lists
-            donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-            ps = h.convert2SWIG()
-
-            newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
-            newQualityPlifsFeatures, unneeded1, unneeded2 =\
-            self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
-            mm_len = run['matchmatrixRows']*run['matchmatrixCols']
-
-            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
-            newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
-
-            newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
-            # Calculate the weights of the respective alignments. Note that we
-            # are calculating n-best alignments without Hamming loss, so we
-            # have to keep track of which of the n-best alignments correspond
-            # to the true one, in order not to incorporate a true alignment in
-            # the constraints. To do so we define an array true_map with a
-            # boolean flag indicating, for each decoded alignment, its
-            # equivalence to the true alignment.
-            true_map = [0]*(num_path[exampleIdx]+1)
-            true_map[0] = 1
-
-            for pathNr in range(num_path[exampleIdx]):
-               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
-               h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
-               acc_supp)
-              
-               decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
-               decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
-               # store the weights in the remaining rows of the matrix
-               allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
-
-               hpen = mat(h.penalties).reshape(len(h.penalties),1)
-               dpen = mat(d.penalties).reshape(len(d.penalties),1)
-               apen = mat(a.penalties).reshape(len(a.penalties),1)
-               features = numpy.vstack([hpen, dpen, apen, mmatrix[:], totalQualityPenalties[:]])
-
-               featureVectors[:,exampleIdx] = allWeights[:,pathNr+1]
-
-               AlignmentScores[pathNr+1] = (allWeights[:,pathNr+1].T * features)[0,0]
-
-               distinct_scores = False
-               if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
-                  distinct_scores = True
-
-               # Check whether scalar product + loss equals the Viterbi score
-               if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
-                  self.plog("Scalar prod. + loss not equals Viterbi output!\n")
-                  pdb.set_trace()
-
-               self.plog(" scalar prod (correct) : %f\n"%AlignmentScores[0])
-               self.plog(" scalar prod (pred.) : %f %f\n"%(newDPScores[pathNr,0],AlignmentScores[pathNr+1]))
-
-               # if the pathNr-best alignment is very close to the true alignment consider it as true
-               if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
-                  true_map[pathNr+1] = 1
-
-               if not trueAlignmentScore <= max(AlignmentScores[1:]) + 1e-6:
-                  print "suboptimal_example %d\n" %exampleIdx
-                  #trueSpliceAlign, trueWeightMatch, trueWeightQuality dna_calc=\
-                  #computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs)
-
-                  #pdb.set_trace()
-                  suboptimal_example += 1
-                  self.plog("suboptimal_example %d\n" %exampleIdx)
-
-               # if all n-best paths are considered true, they are all too
-               # close to the true alignment, so we extend the n-best search
-               # to an (n+1)-best search in the next iteration
-               if len([elem for elem in true_map if elem == 1]) == len(true_map):
-                  num_path[exampleIdx] = num_path[exampleIdx]+1
-
-               # Choose true and first false alignment for extending
-               firstFalseIdx = -1
-               for map_idx,elem in enumerate(true_map):
-                  if elem == 0:
-                     firstFalseIdx = map_idx
-                     break
-
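-               # disabled debugging output; note that currentAlignment is only available inside do_alignment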
-               if False:
-                  self.plog("Is considered as: %d\n" % true_map[1])
-
-                  #result_len = currentAlignment.getResultLength()
-
-                  dna_array,est_array = currentAlignment.getAlignmentArraysNew()
-
-                  _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
-                  _newEstAlign = newEstAlign[0].flatten().tolist()[0]
-
-               # if there is at least one useful false alignment add the
-               # corresponding constraints to the optimization problem
-               if firstFalseIdx != -1:
-                  firstFalseWeights = allWeights[:,firstFalseIdx]
-                  differenceVector  = trueWeight - firstFalseWeights
-                  #pdb.set_trace()
-
-                  const_added = solver.addConstraint(differenceVector, exampleIdx)
-                  const_added_ctr += 1
-
-               # end of one example processing 
-
-            # call the solver after every numConstPerRound examples / added constraints
-            if exampleIdx != 0 and exampleIdx % numConstPerRound == 0:
-               objValue,w,self.slacks = solver.solve()
-               solver_call_ctr += 1
-
-               if solver_call_ctr == 5:
-                  numConstPerRound = 200
-                  self.plog('numConstPerRound is now %d\n'% numConstPerRound)
-
-               if math.fabs(objValue - self.oldObjValue) <= 1e-6:
-                  self.noImprovementCtr += 1
-
-               if self.noImprovementCtr == numExamples+1:
-                  break
-
-               self.oldObjValue = objValue
-               print "objValue is %f" % objValue
-
-               sum_xis = 0
-               for elem in self.slacks:
-                  sum_xis +=  elem
-
-               print 'sum of slacks is %f'% sum_xis 
-               self.plog('sum of slacks is %f\n'% sum_xis)
-   
-               for i in range(len(param)):
-                  param[i] = w[i]
-
-               cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
-               param_idx += 1
-               [h,d,a,mmatrix,qualityPlifs] =\
-               set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
-
-         ##############################################
-         # end of one iteration through all examples  #
-         ##############################################
-
-         self.plog("suboptimal rounds %d\n" %suboptimal_example)
-
-         if self.noImprovementCtr == numExamples*2:
-            self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
-
-         iteration_nr += 1
-
-      #
-      # end of optimization 
-      #  
-      self.FinalizeTraining(param,'param_%d.pickle'%param_idx)
-
-
-   def FinalizeTraining(self,vector,name):
-      self.plog("Training completed")
-      cPickle.dump(vector,open(name,'w+'))
-      self.logfh.close()
-   
-
-###############################################################################
-#
-# End of the code needed for training 
-#
-# Begin of code for prediction
-#
-###############################################################################
-
-
-   def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
-      """
-      Performing a prediction takes...
-      """
-      self.set_name = set_name
-
-      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
-      full_working_path = self.run['result_dir']
-
-      print 'full_working_path is %s' % full_working_path 
-
-      #assert not os.path.exists(full_working_path)
-      if not os.path.exists(full_working_path):
-         os.mkdir(full_working_path)
-
-      assert os.path.exists(full_working_path)
-
-      # ATTENTION: Changing working directory
-      os.chdir(full_working_path)
-
-      self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
-
-      # number of prediction instances
-      self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
-
-      # load dataset and fetch instances that shall be predicted
-      dataset = cPickle.load(open(dataset_fn))
-
-      prediction_set = {}
-      for key in prediction_keys:
-         prediction_set[key] = dataset[key]
-
-      # we do not need the full dataset anymore
-      del dataset
-   
-      # Load parameter vector to predict with
-      param = cPickle.load(open(param_fn))
-
-      self.predict(prediction_set,param)
-
-
-   def predict(self,prediction_set,param):
-      """
-      This method...
-      """
-
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
-
-      # Set the parameters such as limits/penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] =\
-      set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
-
-      if not self.qpalma_debug_mode:
-         self.plog('Starting prediction...\n')
-
-      self.problem_ctr = 0
-
-      # where we store the predictions
-      allPredictions = []
-
-      # we take the first quality vector of the tuple of quality vectors
-      quality_index = 0
-
-      # beginning of the prediction loop
-      for example_key in prediction_set.keys():
-         print 'Current example %d' % example_key
-         for example in prediction_set[example_key]:
-
-            currentSeqInfo,read,currentQualities = example
-
-            id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
-            currentSeqInfo 
-
-            if not self.qpalma_debug_mode:
-               self.plog('Loading example id: %d...\n'% int(id))
-
-            if self.run['enable_quality_scores']:
-               quality = currentQualities[quality_index]
-            else:
-               quality = [40]*len(read)
-
-            try:
-               currentDNASeq, currentAcc, currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
-            except:
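-               # sequence could not be fetched; count the problem and skip this example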
-               self.problem_ctr += 1
-               continue
-
-            if not self.run['enable_splice_signals']:
-               for idx,elem in enumerate(currentDon):
-                  if elem != -inf:
-                     currentDon[idx] = 0.0
-
-               for idx,elem in enumerate(currentAcc):
-                  if elem != -inf:
-                     currentAcc[idx] = 0.0
-
-            current_prediction = self.calc_alignment(currentDNASeq, read,\
-            quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
-
-            current_prediction['id']         = id
-            current_prediction['chr']        = chromo
-            current_prediction['strand']     = strand
-            current_prediction['start_pos']  = genomicSeq_start
-
-            allPredictions.append(current_prediction)
-
-      if not self.qpalma_debug_mode:
-         self.FinalizePrediction(allPredictions)
-      else:
-         return allPredictions
-
-
-   def FinalizePrediction(self,allPredictions):
-      """ End of the prediction loop we save all predictions in a pickle file and exit """
-
-      cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
-      self.plog('Prediction completed\n')
-      mes =  'Problem ctr %d' % self.problem_ctr
-      self.plog(mes+'\n')
-      self.logfh.close()
-
-
-   def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
-      """
-      Given two sequences and the parameters we calculate on alignment
-      """
-
-      run = self.run
-      donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
-
-      if '-' in read:
-         self.plog('found gap\n')
-         read = read.replace('-','')
-         assert len(read) == run['read_size']
-
-      ps = h.convert2SWIG()
-
-      newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
-      newQualityPlifsFeatures, dna_array, read_array =\
-      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
-
-      mm_len = run['matchmatrixRows']*run['matchmatrixCols']
-
-      true_map    = [0]*2
-      true_map[0] = 1
-      pathNr      = 0
-
-      _newSpliceAlign   = array.array('B',newSpliceAlign)
-      _newEstAlign      = array.array('B',newEstAlign)
-       
-      #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
-      alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) 
-
-      dna_array   = array.array('B',dna_array)
-      read_array  = array.array('B',read_array)
-
-      newExons = self.calculatePredictedExons(newSpliceAlign)
-
-      current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
-      'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
-      'dna_array':dna_array, 'read_array':read_array }
-
-      return current_prediction
-
-
-   def calculatePredictedExons(self,SpliceAlign):
-      newExons = []
-      oldElem = -1
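-      # append a sentinel so that an exon ending at the last position is closed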
-      SpliceAlign.append(-1)
-      for pos,elem in enumerate(SpliceAlign):
-         if pos == 0:
-            oldElem = -1
-         else:
-            oldElem = SpliceAlign[pos-1]
-
-         if oldElem != 0 and elem == 0: # start of exon
-            newExons.append(pos)
-
-         if oldElem == 0 and elem != 0: # end of exon
-            newExons.append(pos)
-
-      return newExons
diff --git a/scripts/resurrect b/scripts/resurrect
deleted file mode 100755 (executable)
index 3e7e980..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-instance=$1
-
-source $HOME/.bashrc
-
-#python -c "import sys; from cPickle import *; ma=load(open('lmm_${instance}.pickle'))
-#
-#try:
-#   ma.run()
-#except:
-#   sys.exit(99)"
-
-python -c "import sys; from cPickle import *; ma=load(open('${instance}')); ma.train()"