+ extended documentation
author	Fabio <fabio@congo.fml.local>
Fri, 10 Oct 2008 13:44:00 +0000 (15:44 +0200)
committer	Fabio <fabio@congo.fml.local>
Fri, 10 Oct 2008 13:44:00 +0000 (15:44 +0200)
+ removed legacy code concerning run object
+ unified tests
+ added creation of artificial small dataset for faster debugging

16 files changed:
README
doc/qpalma-manual.tex [new file with mode: 0644]
doc/qpalma.tex [deleted file]
qpalma/SIQP.py
qpalma/SIQP_CVXOPT.py
qpalma/SettingsParser.py
qpalma/computeSpliceAlignWithQuality.py
qpalma/gridtools.py
qpalma/qpalma_main.py
qpalma/set_param_palma.py
scripts/qpalma_pipeline.py
test.conf
tests/test_qpalma.py [new file with mode: 0644]
tests/test_qpalma_prediction.py [deleted file]
tests/test_sequence_utils.py [deleted file]
tests/test_utils.py [new file with mode: 0644]

diff --git a/README b/README
index 9fec71f..f704ae1 100644 (file)
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-Installation instructions for QPalma Version 1.0.
+Installation instructions for QPalma Version 1.0
   
 The package requires version 2.5 or newer of Python, and is built from 
 source, so the header files and libraries for Python must be installed, 
@@ -21,7 +21,7 @@ or ask your sysadmin to install the packages.
 
 Pythongrid and Genefinding are two utility packages from our lab. These can be obtained via:
    
-   http://
+   http://www.fml.tuebingen.mpg.de/raetsch/projects/qpalma
 
 Once you have the above packages installed you can start installing QPalma.
 
@@ -45,18 +45,15 @@ You will need the following variables set:
 - 
 
 
+Further information
+-------------------
 
-
-
-
-
-
-Troubleshooting
----------------
+   More extensive documentation can be found under the doc/ directory of the
+   QPalma distribution.
 
 
 
 Contact
 -------
 
-If however everything failes contact: fabio@tuebingen.mpg.de
+   fabio@tuebingen.mpg.de
diff --git a/doc/qpalma-manual.tex b/doc/qpalma-manual.tex
new file mode 100644 (file)
index 0000000..78f011e
--- /dev/null
@@ -0,0 +1,241 @@
+\documentclass{article}
+\usepackage{a4}
+
+\begin{document}
+
+\newcommand{\QP}{{\sl QPALMA }}
+\newcommand{\QPA}{{\sl QPALMA alignment algorithm }}
+\newcommand{\QPH}{{\sl QPALMA approximation }}
+\newcommand{\QPP}{{\sl QPALMA pipeline }}
+
+\title{QPalma Documentation}
+\author{Fabio De Bona}
+\date{October 2008}
+
+\maketitle
+%
+%
+%
+\section{Intro}
+
+\QP is an alignment tool designed to align spliced reads produced by ``Next
+Generation'' sequencing platforms such as \emph{Illumina Genome Analyzer} or \emph{454}.
+The basic idea is to use an extended Smith-Waterman algorithm for local
+alignments that uses the base quality information of the reads directly in the
+alignment step. Optimal alignment parameters, i.e. scoring matrices, are inferred
+using a machine learning technique similar to \emph{Support Vector Machines}.
+For further details on \QP itself consult the paper \cite{DeBona08}. For
+details about the learning method see \cite{Tsochantaridis04}.
+
+%
+%
+%
+\section{Installation}
+
+The following installation notes assume that you are working in a \emph{Linux}/*NIX
+environment. \emph{Windows} or \emph{MacOS} versions do not exist and are not planned. You
+should have a working \emph{DRMAA} (http://www.drmaa.org/) installation to
+submit jobs to a cluster. \QP is written in C++ and Python and tested with the
+\emph{gcc} compiler suite and Python 2.5.
+
+\subsection{Dependencies}
+
+\QP was designed to be as self-contained as possible. However it still has some dependencies.
+In order to install \QP completely you will need:
+\begin{itemize}
+\item \emph{SWIG}, the Simplified Wrapper and Interface Generator,
+\item[$\rightarrow$] http://www.swig.org
+\item \emph{numpy}, a Python package for numeric computations,
+\item[$\rightarrow$] http://numpy.scipy.org/
+\item \emph{Pythongrid}, a package for cluster computation, and
+\item the \emph{Genefinding} tools, which offer some data formats.
+\end{itemize}
+The latter two packages can be obtained from the \QP website. \\ \noindent
+For training \QP you need one of the following optimization toolkits:
+\begin{itemize}
+\item CVXOPT (http://abel.ee.ucla.edu/cvxopt, Free)
+\item MOSEK (http://www.mosek.com, Commercial)
+\item CPLEX (http://www.ilog.com/products/cplex, Commercial)
+\end{itemize}
+
+\subsection{Step by step installation guide}
+
+\begin{enumerate}
+\item Install the Pythongrid and the Genefinding tool packages
+\item Update your PYTHONPATH variable to point to the above packages (see the example below)
+\item Unpack the QPalma tarball via
+\item[$\rightarrow$] tar -xzvf QPalma-1.0.tar.gz
+\item Enter the QPalma-1.0 directory and type:
+\item[$\rightarrow$] python setup.py build
+\end{enumerate}
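+\noindent
+As an illustration, a minimal shell session covering steps 2--4 could look as
+follows (assuming a bash-like shell; the two installation paths are placeholders
+for wherever you installed the Pythongrid and Genefinding packages):
+\begin{verbatim}
+export PYTHONPATH=$PYTHONPATH:/path/to/pythongrid:/path/to/genefinding
+tar -xzvf QPalma-1.0.tar.gz
+cd QPalma-1.0
+python setup.py build
+\end{verbatim}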
+\noindent
+In order to check your installation you can run the script
+test\_qpalma\_installation.py which can be found in the directory tests/. This
+script either reports a successful installation or generates an error log file.
+You can send this error log file to qpalma@tuebingen.mpg.de.
+\\ \noindent
+In order to make a full test run with sample data you can run the script
+test\_complete\_pipeline.py, which performs some checks on a small artificial
+dataset.
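+\\ \noindent
+Both test scripts are plain Python scripts; assuming you are in the root
+directory of the unpacked distribution, they can be invoked e.g. via:
+\begin{verbatim}
+python tests/test_qpalma_installation.py
+python tests/test_complete_pipeline.py
+\end{verbatim}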
+
+%
+%
+%
+\section{Working with \QP}
+
+I assume now that you have a successful \QP installation. When working with \QP
+you usually only deal with two commands:
+
+\begin{itemize}
+\item python qpalma\_pipeline.py train example.conf
+\item python qpalma\_pipeline.py predict example.conf
+\end{itemize}
+\noindent
+\QP has two modes, \emph{predict} and \emph{train}; all settings are supplied in
+a configuration file, here example.conf. \\ \noindent
+
+A typical run consists of 
+\begin{enumerate}
+\item Set your parameters (read size, number of mismatches, etc.) in the
+configuration file.
+\end{enumerate}
+
+Basically \QP assumes that you have the following data:
+\begin{itemize}
+\item The reads you want to align,
+\item parts/full genomic sequences of an organism,
+\item splice site scores predicted for the genomic sequences.
+\end{itemize}
+
+\QP has one central configuration file: 
+
+Suppose you have a
+
+The project results directory (\emph{result\_dir}) then contains the subdirectories
+\begin{itemize}
+\item \emph{mapping} with subdirs main and spliced
+\item \emph{alignment} with subdirs for the different parameters and \emph{heuristic}
+\item \emph{remapping}
+\end{itemize}
+
+\subsection{The configuration file}
+
+Via the variable ``result\_dir'' you can specify where all of QPalma's data should reside.
+This directory contains the following subdirectories:
+\begin{itemize}
+\item preprocessing
+\item approximation
+\item prediction
+\item postprocessing, and 
+\item training
+\end{itemize}
+
+%
+%
+%
+\section{Pipeline}
+
+The full pipeline consists of the following steps:
+
+\begin{enumerate}
+\item Find alignment seeds using a fast suffix array method (vmatch) for
+all given reads. This may take several rounds for subsets of the reads.
+\item Preprocess the reads and their seeds. Convert them to a qpalma format with some sanity checks.
+\item Use the \QPH to identify those reads that have a full seed but might be
+spliced anyway.
+\item Once we have identified all potentially spliced reads, we use the \QPA to align
+them to their seed regions.
+\item One can choose between several post-processing steps in order to refine
+the quality of the alignments via filtering.
+\end{enumerate}
+
+%
+%
+%
+\section{File Formats / Specifications}
+
+This section introduces all formats and conventions that are assumed to be met
+by the users in order to make \QP work.
+
+\subsection{Format of the configuration file}
+
+The configuration file includes all settings \QP needs to perform an analysis.
+This includes paths to the files where the raw data resides as well as settings
+such as which sequencing platform is being used, the number of cluster nodes to
+employ, etc.
+
+Its entries are of the form
+\begin{center}
+key = value
+\end{center}
+while lines starting with ``\#'' are treated as comments.
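+\\ \noindent
+For illustration, a small fragment of such a configuration file could look
+like this (the keys shown here are taken from the example file test.conf in
+the \QP source tree):
+\begin{verbatim}
+# output format can be one of BLAT, ShoRe or mGene
+output_format = BLAT
+prb_offset    = 64
+enable_quality_scores = True
+max_intron_len        = 2000
+\end{verbatim}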
+
+\subsection{Read format and internal representation}
+
+The format of the file containing the mapped short reads is as follows.  Each
+line corresponds to one short read. Each line has six tab-separated entries,
+namely:
+
+\begin{enumerate}
+\item unique read id
+\item chromosome/contig id
+\item position of match in chromosome/contig
+\item strand 
+\item read sequence (in strand specific direction)
+\item read quality (in strand specific direction)
+\end{enumerate}
+
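+For example, a single line of such a file could look like this (the six fields
+are tab-separated; the values are invented for illustration and the quality
+string uses one character per base):
+\begin{verbatim}
+42   1   20   +   catctatgcaacagcatt   hhhhhhhhhhhhhhhhhh
+\end{verbatim}
+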
+Strand specific direction means that \QP assumes that the reads, as well as
+their qualities, are already in their true orientation; internally no reverse
+complementing takes place.
+
+\subsection{Splice Scores}
+
+The splice site scores were generated by the software... . If you would like
+to use your own splice site predictions you can create files according to the
+format \QP uses. This splice site score format is described below. For each
+canonical acceptor ($AG$) and donor site ($GT$/$GC$) \QP expects a score. For every
+chromosome or contig we have four files. For each strand we have a binary
+file containing the positions and a binary file containing the scores. The
+positions are stored as unsigned values and the scores as floats.  The
+positions are 1-based and the assignment of positions and their scores is as
+follows: The acceptor score positions are the positions right after the $AG$ and the
+donor score positions are the positions right on the $G$ of the $GT$. For example:
+\begin{center}
+\begin{tabular}{ccccccccccc}
+... & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & ... \\ 
+... & w & g & t & x & y & z & a & g  & v  & ... \\
+... &   & 0.2&  &   &   &   &   &    & 0.3 & ... 
+\end{tabular}
+\end{center}
+We supply a script for converting ASCII files into these binary files. You can
+use this script as a template to create your own scoring information files.
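+\\ \noindent
+As a rough sketch of what such a conversion looks like, the following Python
+snippet writes one pair of position/score files for the donor and acceptor
+scores of the small example above; the file names are placeholders and have to
+follow your own splice score file naming scheme:
+\begin{verbatim}
+import array
+
+# example from the table above: donor score at position 4, acceptor at 11
+don_pos    = array.array('I', [4])    # unsigned, 1-based positions
+don_scores = array.array('f', [0.2])  # float scores
+acc_pos    = array.array('I', [11])
+acc_scores = array.array('f', [0.3])
+
+# one positions file and one scores file per strand (names are placeholders)
+don_pos.tofile(open('don_contig1+.pos', 'wb'))
+don_scores.tofile(open('don_contig1+.Conf', 'wb'))
+acc_pos.tofile(open('acc_contig1+.pos', 'wb'))
+acc_scores.tofile(open('acc_contig1+.Conf', 'wb'))
+\end{verbatim}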
+
+\section{Remarks}
+
+The \QP project is licensed under the GPL. \\ \noindent
+The official \QP project email address is:
+\begin{center}
+   qpalma@tuebingen.mpg.de
+\end{center}
+
+%
+% Bibliography
+% 
+
+\begin{thebibliography}{1}
+
+\bibitem[1]{DeBona08} 
+   De~Bona~F.~and~Ossowski~S.~and~Schneeberger~K.~and~G.~R{\"a}tsch
+   \newblock Optimal Spliced Alignment of Short Sequence Reads
+   \newblock {\em ECCB 2008}
+
+\bibitem[2]{Tsochantaridis04} 
+   Ioannis~Tsochantaridis~and~Thomas~Hofmann~and~Thorsten~Joachims~and~Yasemin~Altun
+   \newblock Support Vector Machine Learning for Interdependent and Structured Output Spaces
+   \newblock {\em Proceedings of the 21st International Conference on Machine Learning}, 2004
+
+\end{thebibliography}
+%
+%
+%
+\end{document}
diff --git a/doc/qpalma.tex b/doc/qpalma.tex
deleted file mode 100644 (file)
index 5e1532e..0000000
+++ /dev/null
@@ -1,240 +0,0 @@
-\documentclass{article}
-\usepackage{a4}
-
-\begin{document}
-
-\newcommand{\QP}{{\sl QPALMA }}
-\newcommand{\QPA}{{\sl QPALMA alignment algorithm }}
-\newcommand{\QPH}{{\sl QPALMA approximation }}
-\newcommand{\QPP}{{\sl QPALMA pipeline }}
-
-\title{QPalma Documentation}
-\author{Fabio De Bona}
-\date{}
-
-\maketitle
-%
-%
-%
-\section{Intro}
-
-\QP is an alignment tool targeted to align spliced reads produced by ``Next
-Generation'' sequencing platforms such as $Illumina Genome Analyzer$ or $454$.
-The basic idea is to use an extended Smith-Waterman algorithm for local
-alignments that uses the base quality information of the reads directly in the
-alignment step. Optimal alignment parameters i.e. scoring matrices are inferred
-using a machine learning technique similar to \emph{Support Vector Machines}.
-For further details on \QP itself consult the paper \cite{DeBona08}.  For
-details about the learning method see \cite{Tsochantaridis04}.
-%We refer to the whole pipeline as the \QP pipeline and \QP respectively.
-
-\section{Quicktour}
-
-Basically \QP assumes that you have the following data:
-\begin{itemize}
-\item The reads you want to align,
-\item parts/full genomic sequences of an organism,
-\item splice site scores predicted for the genomic sequences.
-\end{itemize}
-
-\QP has one central configuration file: 
-
-Suppose you have a
-
-The project results directory (\emph{result\_dir}) contains then the subdirectories
-\begin{itemize}
-\item \emph{mapping} with subdirs main and spliced
-\item \emph{alignment} with subdirs for the different parameters and \emph{heuristic}
-\item \emph{remapping}
-\end{itemize}
-
-
-Via the variable ``result\_dir'' you can specify where all of QPalma's data should reside.
-This directory contains the following subdirectories:
-\begin{itemize}
-\item preprocessing
-\item approximation
-\item prediction
-\item postprocessing, and 
-\item training
-\end{itemize}
-
-%
-%
-%
-\section{Installation}
-
-QPalma has the following requirements:
-\begin{itemize}
-\item Numpy
-\item In order to use QPalma on a cluster you need the pythongrid package which
-can be found under the following URL:
-\item For training you need either one of the following optimization toolkits:
-\begin{itemize}
-\item CPLEX
-\item CVXOPT
-\item MOSEK
-\end{itemize}
-\end{itemize}
-
-
-%
-%
-%
-\section{Pipeline}
-
-The full pipline consists of $n$ steps:
-
-\begin{enumerate}
-\item Find alignment seeds using a fast suffix array method (vmatch) for
-all given reads. This may take several rounds for subsets of the reads.
-\item Preprocess the reads and their seeds. Convert them to a qpalma format with some sanity checks.
-\item Use the \QPH to identify those reads that have a full seed but might be
-spliced anyways.
-\item Once we identified all potentially spliced reads we use \QPA to align
-those to their seed regions.
-\item One can choose between several post-processing steps in order to refine
-the quality of the alignments via filtering.
-\end{enumerate}
-
-%
-%
-%
-\section{Usage}
-
-A typical run consists of 
-\begin{enumerate}
-\item Set your parameters (read size, number of mismatches, etc.) in the
-configuration files.
-\item Start the QPalma pipeline via \emph{start\_pipeline}.
-\end{enumerate}
-
-\section{Options}
-
-First one creates datasets using run-specific preprocessing tools. After
-dataset creation one checks the sets for consistency using the
-check\_dataset\_consistency.py script.
-
-\section{QPalma Commands}
-
-\subsection{check\_and\_init}
-
-Performs sanity checking of the configurations file(s). Initializes needed
-directories.
-%
-%
-%
-\section{Training}
-
-QPalma needs some training examples
-For the training you need:
-
-\begin{itemize}
-\item Training examples i.e. correct alignments (you can artificially generate those see \ref)
-\item Splice site predictions
-\item Flat files of the genomic sequences you want to align to
-\end{itemize}
-
-A training example is definded as a $4$-tuple, with elements:
-\begin{enumerate}
-\item A sequence information tuple
-\item the read itself
-\item the quality tuple
-\item the alignment information
-\end{enumerate}
-
-A prediction example is defined as a $3$-tuple, with elements:
-\begin{enumerate}
-\item A sequence information tuple
-\item the read itself
-\item the quality tuple
-\end{enumerate}
-
-The sequence information tuple itself consists of
-\begin{enumerate}
-\item The read id
-\item Chromosome/Contig id
-\item Strand
-\item Start of the genomic region we want to align to
-\item Stop of the genomic region
-\end{enumerate}
-
-%
-%
-%
-\section{Format Specifications}
-
-This section introduces all formats and conventions that are assumed to be met
-by the users in order to make \QP work.
-
-\subsection{Format of the configuration file}
-
-The configuration file includes are settings \QP needs to perform an analysis.
-This includes paths to file where the raw data exists as well as settings which
-sequencing platform is being used,the number of cluster nodes to employ etc. .
-
-Its values are in the form 
-\begin{center}
-key = value
-\end{center}
-and ``#'' for lines containing comments.
-
-
-\subsection{File Formats}
-The format of the file containing the mapped short reads is as follows.  Each
-line corresponds to one short read. Each line has six tab separated entries,
-namely:
-\begin{enumerate}
-\item unique read id
-\item chromosome/contig id
-\item position of match in chromosome/contig
-\item strand 
-\item read sequence
-\item read quality
-\end{enumerate}
-
-\subsection{Read format and internal representation}
-
-\begin{itemize}
-\item Which nucleotide bears the score of the splice site ?
-\item What exactly are the exon/intron boundaries pointing to (also are they 0/1-based) ?
-\end{itemize}
-
-We address the first question as follows:
-\texttt{
-----gt-------ag------
-    *         *
-}
-the score is assumed to be saved at the position of the g's of the splice sites.
-
-\subsection{Splice Scores}
-
-The splice site scores where generated by ... . However if you train by
-yourself then you can use splice site predictions of any kind... as long as the
-prediction of one site is a real value.
-
-
-Dependencies so far
-
-- SWIG
-- numpy
-- pythongrid
-- Genefinding doIntervalQuery
-
-\begin{thebibliography}{1}
-
-\bibitem[1]{DeBona08} 
-   De~Bona~F.~and~Ossowski~S.~and~Schneeberger~K.~and~G.~R{\"a}tsch
-   \newblock Optimal Spliced Alignment of Short Sequence Reads
-   \newblock {\em ECCB 2008}
-
-\bibitem[2]{Tsochantaridis04} 
-   Ioannis~Tsochantaridis~and~Thomas~Hofmann~and~Thorsten~Joachims~and~Yasemin~Altun
-   \newblock Support Vector Machine Learning for Interdependent and Sturcutured Output Spaces
-   \newblock {\em Proceedings of the 16th International Conference on Machine Learning}, 2004
-
-\end{thebibliography}
-%
-%
-%
-\end{document}
diff --git a/qpalma/SIQP.py b/qpalma/SIQP.py
index c994785..66b8003 100644 (file)
@@ -37,7 +37,7 @@ class SIQP:
 
    """
 
-   def __init__(self,fSize,numExamples,c,run):
+   def __init__(self,fSize,numExamples,c,settings):
       assert fSize > 0, 'You have to have at least one feature!'
       assert numExamples > 0, 'You have to have at least one example!'
       self.numFeatures = fSize
@@ -45,7 +45,7 @@ class SIQP:
       self.C = c
       self.EPSILON = 10e-15
 
-      self.run = run
+      self.settings = settings
 
       logging.debug("Starting SIQP solver with %d examples. A feature space dim. of %d.", numExamples,fSize)
       logging.debug("Regularization parameters are C=%f."%(self.C))
@@ -63,19 +63,19 @@ class SIQP:
       # end of zeroing regularizer
 
    def createSmoothnessRegularizer(self):
-      run = self.run
+      settings = self.settings
 
       self.P = cb.matrix(0.0,(self.dimP,self.dimP))
       for i in range(self.numFeatures):
          self.P[i,i] = 1.0
 
-      lengthSP    = run['numLengthSuppPoints']
-      donSP       = run['numDonSuppPoints']
-      accSP       = run['numAccSuppPoints']
-      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = run['numQualSuppPoints']
-      totalQualSP = run['totalQualSuppPoints']
-      numQPlifs = run['numQualPlifs'] 
+      lengthSP    = settings['numLengthSuppPoints']
+      donSP       = settings['numDonSuppPoints']
+      accSP       = settings['numAccSuppPoints']
+      mmatrixSP   = settings['matchmatrixRows']*settings['matchmatrixCols']
+      numq        = settings['numQualSuppPoints']
+      totalQualSP = settings['totalQualSuppPoints']
+      numQPlifs   = settings['numQualPlifs'] 
 
       #lengthGroupParam  = 5*1e-9
       #spliceGroupParam  = 1e-9
diff --git a/qpalma/SIQP_CVXOPT.py b/qpalma/SIQP_CVXOPT.py
index d3be2e5..95260e9 100644 (file)
@@ -42,8 +42,8 @@ class SIQPSolver(SIQP):
    
    """
 
-   def __init__(self,fSize,numExamples,c,proto,run):
-      SIQP.__init__(self,fSize,numExamples,c,run)
+   def __init__(self,fSize,numExamples,c,proto,settings):
+      SIQP.__init__(self,fSize,numExamples,c,settings)
       self.numConstraints = 0
       self.solver_runs = 0
       self.old_objective_value = -(sys.maxint-1)
diff --git a/qpalma/SettingsParser.py b/qpalma/SettingsParser.py
index 700c896..83b0858 100644 (file)
 #
 
 import os 
+import re
 import os.path 
 import sys 
+import pdb
 
 jp = os.path.join
 
@@ -26,7 +28,10 @@ def parseSettingsFile(filename):
       if (not line.strip()) or line.startswith('#'):
          pass
       else:
-         key, val = line.strip().replace(' ', '').split('=')
+         line = line.strip()
+         line = re.sub('\s', '', line)  
+         key, val = line.split('=')
+
          settings[key] = val
 
    return settings
@@ -67,18 +72,39 @@ def makeSettings(settings):
    except:
       print 'Error: There was a problem generating the logfile %s' % settings['global_log_fn']
 
-   try:
-      settings['num_splits'] = int(settings['num_splits'])
-   except:
-      print 'Error: num_splits has to be a positive integer'
+   # TODO: add checking support
+   settings['allowed_fragments'] = eval(settings['allowed_fragments'])
 
-   try:
-      settings['prb_offset'] = int(settings['prb_offset'])
-   except:
-      print 'Error: prb_offset has to be a positive integer'
+   TF = [True,False]
 
-   settings['allowed_fragments'] = eval(settings['allowed_fragments'])
-   settings['half_window_size']  = int(settings['half_window_size'])
+   # convert and check all boolean parameters
+   for parameter in ['enable_quality_scores', 'enable_intron_length', 'enable_splice_scores',\
+   'remove_duplicate_scores', 'print_matrix']:
+
+      try:
+         settings[parameter]   = eval(settings[parameter])
+         assert settings[parameter] in TF
+      except:
+         print "Unexpected error:", sys.exc_info()[0]
+         print 'The value of %s has to be either True or False!'%parameter
+
+   # now convert and check all integer parameters
+   for parameter in ['matchmatrixCols', 'matchmatrixRows', 'numAccSuppPoints', 'numConstraintsPerRound',\
+   'numDonSuppPoints', 'numLengthSuppPoints', 'numQualPlifs', 'numQualSuppPoints', 'anzpath', 'iter_steps',\
+   'max_intron_len', 'max_qual', 'min_intron_len', 'min_qual', 'totalQualSuppPoints','C','num_splits','prb_offset','half_window_size']:
+         
+      try:
+         settings[parameter] = int(settings[parameter])
+      except:
+         print 'Error: %s has to be an integer!'%parameter
+
+   # now convert and check all float parameters
+   for parameter in ['max_svm_score','min_svm_score']:
+         
+      try:
+         settings[parameter] = float(settings[parameter])
+      except:
+         print 'Error: %s has to be a float!'%parameter
 
    return settings
 
@@ -95,3 +121,6 @@ def parseSettings(filename):
    settings = makeSettings(settings)
    #assert checkSettings(settings)
    return settings
+
+if __name__ == '__main__':
+   print parseSettings(sys.argv[1])
diff --git a/qpalma/computeSpliceAlignWithQuality.py b/qpalma/computeSpliceAlignWithQuality.py
index e498098..972a8de 100644 (file)
@@ -13,7 +13,7 @@ from Plif import Plf,base_coord,linspace
 
 import sequence_utils
 
-def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
+def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, settings):
    """
    Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
    Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
@@ -71,7 +71,7 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
    # start of label feature generation
    #  
 
-   sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
+   sizeMatchmatrix = settings['matchmatrixRows']*settings['matchmatrixCols'] #
    # counts the occurences of a,c,g,t,n in this order
 
    # counts the occurrences of a,c,g,t,n with their quality scores
@@ -81,7 +81,7 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
 
    for row in range(5):
       for col in range(6):
-         trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
+         trueWeightQualityMat[row][col] = [0.0]*settings['numQualSuppPoints'] # supp. points per plif
 
    dna_part = []
    est_part = []
@@ -108,7 +108,7 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
    for orig_char in est_part:
       assert orig_char in sequence_utils.alphabet, pdb.set_trace()
 
-   trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
+   trueWeightMatch = zeros((settings['matchmatrixRows']*settings['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
 
    map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
 
@@ -153,12 +153,12 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
             trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
             trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
 
-      trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
+      trueWeightQuality = zeros((settings['numQualSuppPoints']*settings['numQualPlifs'],1))
 
       ctr = 0
       for row in range(5):
          for col in range(6):
-            for p_idx in range(run['numQualSuppPoints']):
+            for p_idx in range(settings['numQualSuppPoints']):
                #print ctr, row, col, p_idx
                trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
                ctr += 1
@@ -187,22 +187,22 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
    return SpliceAlign, trueWeightMatch, trueWeightQuality, dna_calc
 
 
-def  computeSpliceAlignScoreWithQuality(original_est, quality, qualityPlifs, run, weightvector):
+def  computeSpliceAlignScoreWithQuality(original_est, quality, qualityPlifs, settings, weightvector):
    """
    """
 
-   lengthSP    = run['numLengthSuppPoints']
-   donSP       = run['numDonSuppPoints']
-   accSP       = run['numAccSuppPoints']
-   mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-   numq        = run['numQualSuppPoints']*5*6
+   lengthSP    = settings['numLengthSuppPoints']
+   donSP       = settings['numDonSuppPoints']
+   accSP       = settings['numAccSuppPoints']
+   mmatrixSP   = settings['matchmatrixRows']*settings['matchmatrixCols']
+   numq        = settings['numQualSuppPoints']*5*6
 
    lengthP = weightvector[0:lengthSP]
    donP = weightvector[lengthSP:lengthSP+donSP]
    accP = weightvector[donSP+lengthSP:donSP+lengthSP+accSP]
    mmatrixP = weightvector[accSP+donSP+lengthSP:accSP+donSP+lengthSP+mmatrixSP]
    numqP = weightvector[accSP+donSP+lengthSP+mmatrixSP:accSP+donSP+lengthSP+mmatrixSP+numq]
-   numQualSuppPoints=run['numQualSuppPoints']
+   numQualSuppPoints=settings['numQualSuppPoints']
    
    map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5, '[': 10, ']':11 }
    original_est_mapped = [ map[e] for e in original_est ]
diff --git a/qpalma/gridtools.py b/qpalma/gridtools.py
index 5d29743..54aae6d 100644 (file)
@@ -191,12 +191,6 @@ class AlignmentTask(ClusterTask):
 
       jp = os.path.join
 
-      run   = cPickle.load(open(self.settings['run_fn']))
-      run['name'] = 'saved_run'
-
-      param_fn = self.settings['prediction_parameter_fn']
-
-      run['result_dir']    = self.settings['prediction_dir']
       dataset_fn           = self.settings['prediction_dataset_fn']
       prediction_keys_fn   = self.settings['prediction_dataset_keys_fn']
 
@@ -212,9 +206,9 @@ class AlignmentTask(ClusterTask):
             chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
 
       for c_name,current_chunk in chunks:
-         current_job = KybJob(gridtools.AlignmentTaskStarter,[self.settings,run,dataset_fn,current_chunk,param_fn,c_name])
-         current_job.h_vmem = '20.0G'
-         #current_job.express = 'True'
+         current_job = KybJob(gridtools.AlignmentTaskStarter,[self.settings,dataset_fn,current_chunk,c_name])
+         current_job.h_vmem = '2.0G'
+         current_job.express = 'True'
 
          print "job #1: ", current_job.nativeSpecification
 
@@ -227,14 +221,14 @@ class AlignmentTask(ClusterTask):
       print 'Got %d job(s)' % len(self.functionJobs)
 
 
-def AlignmentTaskStarter(settings,run,dataset_fn,prediction_keys,param_fn,set_name):
+def AlignmentTaskStarter(settings,dataset_fn,prediction_keys,set_name):
    """
   
    """
    accessWrapper = DataAccessWrapper(settings)
    seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-   qp = QPalma(run,seqInfo)
-   qp.init_prediction(dataset_fn,prediction_keys,param_fn,set_name)
+   qp = QPalma(seqInfo)
+   qp.init_prediction(dataset_fn,prediction_keys,settings,set_name)
    return 'finished prediction of set %s.' % set_name
 
 
@@ -247,6 +241,10 @@ class TrainingTask(ClusterTask):
    def __init__(self):
       ClusterTask.__init__(self)
 
+   def CreateJobs(self):
+      pass
+      #cPickle.dump(settings,open(jp(,'training_settings.pickle''run_obj.pickle','w+'))
+
 
 class PostprocessingTask(ClusterTask):
    """
diff --git a/qpalma/qpalma_main.py b/qpalma/qpalma_main.py
index b3349be..f4b8c89 100644 (file)
@@ -12,8 +12,6 @@ import os.path
 import pdb
 import sys
 
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
-
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
@@ -29,13 +27,17 @@ from qpalma.computeSpliceAlignWithQuality import *
 from qpalma.TrainingParam import Param
 from qpalma.Plif import Plf,compute_donacc
 
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
 from qpalma.utils import pprint_alignment, get_alignment
 
+jp = os.path.join
+dotProd = lambda x,y: (x.T * y)[0,0]
+
 class SpliceSiteException:
    pass
 
 
-def getData(training_set,exampleKey,run):
+def getData(training_set,exampleKey,settings):
    """ This function...  """
 
    currentSeqInfo,original_est,currentQualities,currentExons = training_set[exampleKey]
@@ -47,10 +49,8 @@ def getData(training_set,exampleKey,run):
    est = unbracket_est(est)
    est = est.replace('-','')
 
-   assert len(est) == run['read_size'], pdb.set_trace()
    est_len = len(est)
 
-   #original_est = OriginalEsts[exampleIdx]
    original_est = "".join(original_est)
    original_est = original_est.lower()
 
@@ -89,10 +89,9 @@ class QPalma:
    the alignment.
    """
    
-   def __init__(self,run,seqInfo,dmode=False):
+   def __init__(self,seqInfo,dmode=False):
       self.ARGS = Param()
       self.qpalma_debug_mode = dmode
-      self.run = run
       self.seqInfo = seqInfo
 
 
@@ -101,7 +100,7 @@ class QPalma:
       self.logfh.flush()
 
 
-   def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode):
+   def do_alignment(self,dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,current_num_path,prediction_mode,settings):
       """
       Given the needed input this method calls the QPalma C module which
       calculates a dynamic programming in order to obtain an alignment
@@ -114,7 +113,7 @@ class QPalma:
       chastity = QPalmaDP.createDoubleArrayFromList([.0]*est_len)
 
       matchmatrix = QPalmaDP.createDoubleArrayFromList(mmatrix.flatten().tolist()[0])
-      mm_len = self.run['matchmatrixRows']*self.run['matchmatrixCols']
+      mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
 
       d_len = len(donor)
       donor = QPalmaDP.createDoubleArrayFromList(donor)
@@ -122,13 +121,13 @@ class QPalma:
       acceptor = QPalmaDP.createDoubleArrayFromList(acceptor)
 
       # Create the alignment object representing the interface to the C/C++ code.
-      currentAlignment = QPalmaDP.Alignment(self.run['numQualPlifs'],self.run['numQualSuppPoints'], self.use_quality_scores)
+      currentAlignment = QPalmaDP.Alignment(settings['numQualPlifs'],settings['numQualSuppPoints'], settings['enable_quality_scores'])
       c_qualityPlifs = QPalmaDP.createPenaltyArrayFromList([elem.convert2SWIG() for elem in qualityPlifs])
       # calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
       currentAlignment.myalign( current_num_path, dna, dna_len,\
        est, est_len, prb, chastity, ps, matchmatrix, mm_len, donor, d_len,\
-       acceptor, a_len, c_qualityPlifs, self.run['remove_duplicate_scores'],\
-       self.run['print_matrix'] )
+       acceptor, a_len, c_qualityPlifs, settings['remove_duplicate_scores'],\
+       settings['print_matrix'] )
 
       if prediction_mode:
          # part that is only needed for prediction
@@ -145,8 +144,8 @@ class QPalma:
       newQualityPlifsFeatures, dna_array, est_array
 
 
-   def init_train(self,training_set):
-      full_working_path = os.path.join(self.run['alignment_dir'],self.run['name'])
+   def init_train(self,settings,run_name):
+      full_working_path = jp(settings['alignment_dir'],run_name)
 
       #assert not os.path.exists(full_working_path)
       if not os.path.exists(full_working_path):
@@ -158,37 +157,11 @@ class QPalma:
       os.chdir(full_working_path)
 
       self.logfh = open('_qpalma_train.log','w+')
-      cPickle.dump(self.run,open('run_obj.pickle','w+'))
-
-      self.plog("Settings are:\n")
-      self.plog("%s\n"%str(self.run))
-
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
-
-
-   def setUpSolver(self):
-      # Initialize solver 
-      self.plog('Initializing problem...\n')
-      
-      try:
-         solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
-      except:
-         self.plog('Got no license. Telling queue to reschedule job...\n')
-         sys.exit(99)
-
-      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
-      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
 
-      return solver
+      train(settings)
 
 
-   def train(self,training_set):
+   def train(self,settings,training_set):
       """
       The mainloop for training.
       """
@@ -199,40 +172,48 @@ class QPalma:
       self.noImprovementCtr = 0
       self.oldObjValue = 1e8
 
-      iteration_steps         = self.run['iter_steps']
-      remove_duplicate_scores = self.run['remove_duplicate_scores']
-      print_matrix            = self.run['print_matrix']
-      anzpath                 = self.run['anzpath']
+      remove_duplicate_scores = settings['remove_duplicate_scores']
+      print_matrix            = settings['print_matrix']
+
+      lengthSP    = settings['numLengthSuppPoints']
+      donSP       = settings['numDonSuppPoints']
+      accSP       = settings['numAccSuppPoints']
+      mmatrixSP   = settings['matchmatrixRows']*settings['matchmatrixCols']
+      numq        = settings['numQualSuppPoints']
+      totalQualSP = settings['totalQualSuppPoints']
+
+      # calculate the total number of features
+      numFeatures = lengthSP+donSP+accSP+mmatrixSP+totalQualSP
 
       # Initialize parameter vector
-      param = numpy.matlib.rand(run['numFeatures'],1)
-   
-      lengthSP    = self.run['numLengthSuppPoints']
-      donSP       = self.run['numDonSuppPoints']
-      accSP       = self.run['numAccSuppPoints']
-      mmatrixSP   = self.run['matchmatrixRows']*run['matchmatrixCols']
-      numq        = self.run['numQualSuppPoints']
-      totalQualSP = self.run['totalQualSuppPoints']
+      param = numpy.matlib.rand(numFeatures,1)
 
       # no intron length model
-      if not self.run['enable_intron_length']:
+      if not settings['enable_intron_length']:
          param[:lengthSP] *= 0.0
 
       # Set the parameters such as limits penalties for the Plifs
-      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+      [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
+
+      # Initialize solver 
+      self.plog('Initializing problem...\n')
+      
+      try:
+         solver = SIQPSolver(numFeatures,numExamples,settings['C'],self.logfh,settings)
+      except:
+         self.plog('Got no license. Telling queue to reschedule job...\n')
+         sys.exit(99)
 
-      solver = self.setUpSolver()
+      solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
+      solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
 
       # stores the number of alignments done for each example (best path, second-best path etc.)
-      num_path = [anzpath]*numExamples
+      num_path = [settings['anzpath']]*numExamples
 
-      # stores the gap for each example
-      gap      = [0.0]*numExamples
-
-      currentPhi = zeros((run['numFeatures'],1))
+      currentPhi = zeros((numFeatures,1))
       totalQualityPenalties = zeros((totalQualSP,1))
 
-      numConstPerRound = run['numConstraintsPerRound']
+      numConstPerRound = settings['numConstraintsPerRound']
       solver_call_ctr = 0
 
       suboptimal_example = 0
@@ -240,30 +221,29 @@ class QPalma:
       param_idx = 0
       const_added_ctr = 0
 
-      featureVectors = zeros((run['numFeatures'],numExamples))
+      featureVectors = zeros((numFeatures,numExamples))
 
       self.plog('Starting training...\n')
       # the main training loop
       while True:
-         if iteration_nr == iteration_steps:
+         if iteration_nr == settings['iter_steps']:
             break
 
          for exampleIdx,example_key in enumerate(training_set.keys()):
             print 'Current example %d' % example_key
+
             try:
                dna,est,acc_supp,don_supp,exons,original_est,currentQualities =\
-               getData(training_set,example_key,run)
+               getData(training_set,example_key,settings)
             except SpliceSiteException:
                continue
 
-            dna_len = len(dna)
-
-            if run['enable_quality_scores']:
+            if settings['enable_quality_scores']:
                quality = currentQualities[quality_index]
             else:
                quality = [40]*len(read)
 
-            if not run['enable_splice_signals']:
+            if not settings['enable_splice_scores']:
                for idx,elem in enumerate(don_supp):
                   if elem != -inf:
                      don_supp[idx] = 0.0
@@ -272,11 +252,12 @@ class QPalma:
                   if elem != -inf:
                      acc_supp[idx] = 0.0
 
+
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
-            if run['mode'] == 'using_quality_scores':
+            if settings['enable_quality_scores']:
                trueSpliceAlign, trueWeightMatch, trueWeightQuality ,dna_calc =\
                computeSpliceAlignWithQuality(dna, exons, est, original_est,\
-               quality, qualityPlifs,run)
+               quality, qualityPlifs,settings)
             else:
                trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
 
@@ -293,7 +274,7 @@ class QPalma:
             currentPhi[lengthSP+donSP:lengthSP+donSP+accSP]                   = mat(a.penalties[:]).reshape(accSP,1)
             currentPhi[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP]   = mmatrix[:]
 
-            if run['mode'] == 'using_quality_scores':
+            if settings['enable_quality_scores']:
                totalQualityPenalties = param[-totalQualSP:]
                currentPhi[lengthSP+donSP+accSP+mmatrixSP:]                    = totalQualityPenalties[:]
 
@@ -303,7 +284,7 @@ class QPalma:
             # The allWeights vector is supposed to store the weight parameter
             # of the true alignment as well as the weight parameters of the
             # num_path[exampleIdx] other alignments
-            allWeights = zeros((run['numFeatures'],num_path[exampleIdx]+1))
+            allWeights = zeros((numFeatures,num_path[exampleIdx]+1))
             allWeights[:,0] = trueWeight[:,0]
 
             AlignmentScores = [0.0]*(num_path[exampleIdx]+1)
@@ -318,13 +299,13 @@ class QPalma:
 
             newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
             newQualityPlifsFeatures, unneeded1, unneeded2 =\
-            self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False)
-            mm_len = run['matchmatrixRows']*run['matchmatrixCols']
+            self.do_alignment(dna,est,quality,mmatrix,donor,acceptor,ps,qualityPlifs,num_path[exampleIdx],False,settings)
+            mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
 
-            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],dna_len)
+            newSpliceAlign = newSpliceAlign.reshape(num_path[exampleIdx],len(dna))
             newWeightMatch = newWeightMatch.reshape(num_path[exampleIdx],mm_len)
 
-            newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],run['totalQualSuppPoints'])
+            newQualityPlifsFeatures = newQualityPlifsFeatures.reshape(num_path[exampleIdx],settings['totalQualSuppPoints'])
             # Calculate weights of the respective alignments. Note that we are calculating n-best alignments without 
             # hamming loss, so we have to keep track which of the n-best alignments correspond to the true one in order 
             # not to incorporate a true alignment in the
@@ -339,7 +320,7 @@ class QPalma:
                h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
                acc_supp)
               
-               decodedQualityFeatures = zeros((run['totalQualSuppPoints'],1))
+               decodedQualityFeatures = zeros((settings['totalQualSuppPoints'],1))
                decodedQualityFeatures = newQualityPlifsFeatures[pathNr,:].T
                # Gewichte in restliche Zeilen der Matrix speichern
                allWeights[:,pathNr+1] = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures[:]])
@@ -444,7 +425,7 @@ class QPalma:
                cPickle.dump(param,open('param_%d.pickle'%param_idx,'w+'))
                param_idx += 1
                [h,d,a,mmatrix,qualityPlifs] =\
-               set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+               set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
 
          ##############################################
          # end of one iteration through all examples  #
@@ -478,14 +459,13 @@ class QPalma:
 ###############################################################################
 
 
-   def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
+   def init_prediction(self,dataset_fn,prediction_keys,settings,set_name):
       """
       Performing a prediction takes...
       """
       self.set_name = set_name
 
-      #full_working_path = os.path.join(run['alignment_dir'],run['name'])
-      full_working_path = self.run['result_dir']
+      full_working_path = settings['prediction_dir']
 
       print 'full_working_path is %s' % full_working_path 
 
@@ -514,27 +494,19 @@ class QPalma:
       del dataset
    
       # Load parameter vector to predict with
-      param = cPickle.load(open(param_fn))
+      param = cPickle.load(open(settings['prediction_param_fn']))
 
-      self.predict(prediction_set,param)
+      self.predict(prediction_set,param,settings)
 
 
-   def predict(self,prediction_set,param):
+   def predict(self,prediction_set,param,settings):
       """
       This method...
       """
 
-      if self.run['mode'] == 'normal':
-         self.use_quality_scores = False
-
-      elif self.run['mode'] == 'using_quality_scores':
-         self.use_quality_scores = True
-      else:
-         assert(False)
-
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
-      set_param_palma(param,self.ARGS.train_with_intronlengthinformation,self.run)
+      set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
 
       if not self.qpalma_debug_mode:
          self.plog('Starting prediction...\n')
@@ -560,7 +532,7 @@ class QPalma:
             if not self.qpalma_debug_mode:
                self.plog('Loading example id: %d...\n'% int(id))
 
-            if self.run['enable_quality_scores']:
+            if settings['enable_quality_scores']:
                quality = currentQualities[quality_index]
             else:
                quality = [40]*len(read)
@@ -571,7 +543,7 @@ class QPalma:
                self.problem_ctr += 1
                continue
 
-            if not self.run['enable_splice_signals']:
+            if not settings['enable_splice_scores']:
                for idx,elem in enumerate(currentDon):
                   if elem != -inf:
                      currentDon[idx] = 0.0
@@ -581,7 +553,7 @@ class QPalma:
                      currentAcc[idx] = 0.0
 
             current_prediction = self.calc_alignment(currentDNASeq, read,\
-            quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
+            quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs,settings)
 
             current_prediction['id']         = id
             current_prediction['chr']        = chromo
@@ -606,12 +578,11 @@ class QPalma:
       self.logfh.close()
 
 
-   def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs):
+   def calc_alignment(self, dna, read, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs,settings):
       """
       Given two sequences and the parameters we calculate on alignment
       """
 
-      run = self.run
       donor, acceptor = compute_donacc(don_supp, acc_supp, d, a)
 
       if '-' in read:
@@ -623,9 +594,9 @@ class QPalma:
 
       newSpliceAlign, newEstAlign, newWeightMatch, newDPScores,\
       newQualityPlifsFeatures, dna_array, read_array =\
-      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True)
+      self.do_alignment(dna,read,quality,mmatrix,donor,acceptor,ps,qualityPlifs,1,True,settings)
 
-      mm_len = run['matchmatrixRows']*run['matchmatrixCols']
+      mm_len = settings['matchmatrixRows']*settings['matchmatrixCols']
 
       true_map    = [0]*2
       true_map[0] = 1
diff --git a/qpalma/set_param_palma.py b/qpalma/set_param_palma.py
index 9ad4abd..036b693 100644 (file)
@@ -15,26 +15,26 @@ import QPalmaDP
 import pdb
 from Plif import *
 
-def set_param_palma(param, train_with_intronlengthinformation, run):
+def set_param_palma(param, train_with_intronlengthinformation, settings):
 
    print 'Setting parameters ...'
    
-   min_intron_len = run['min_intron_len'] 
-   max_intron_len = run['max_intron_len']
+   min_intron_len = settings['min_intron_len'] 
+   max_intron_len = settings['max_intron_len']
 
-   min_svm_score = run['min_svm_score'] 
-   max_svm_score = run['max_svm_score'] 
+   min_svm_score = settings['min_svm_score'] 
+   max_svm_score = settings['max_svm_score'] 
 
    h = Plf()
    d = Plf()
    a = Plf()
 
-   qualityPlifs = [None]*run['numQualPlifs']
-   donSP       = run['numDonSuppPoints']
-   accSP       = run['numAccSuppPoints']
-   lengthSP    = run['numLengthSuppPoints']
-   mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
-   totalQualSP = run['totalQualSuppPoints']
+   qualityPlifs = [None]*settings['numQualPlifs']
+   donSP       = settings['numDonSuppPoints']
+   accSP       = settings['numAccSuppPoints']
+   lengthSP    = settings['numLengthSuppPoints']
+   mmatrixSP   = settings['matchmatrixRows']*settings['matchmatrixCols']
+   totalQualSP = settings['totalQualSuppPoints']
 
    ####################
    # Gapfunktion
@@ -84,23 +84,23 @@ def set_param_palma(param, train_with_intronlengthinformation, run):
    mmatrix = numpy.matlib.mat(param[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP])
 
 
-   mmatrix.reshape(run['matchmatrixRows'],run['matchmatrixCols'])
+   mmatrix.reshape(settings['matchmatrixRows'],settings['matchmatrixCols'])
 
    ####################
    # Quality Plifs
    ####################
-   for idx in range(run['numQualPlifs']):
+   for idx in range(settings['numQualPlifs']):
       currentPlif = Plf()
-      currentPlif.limits    = linspace(run['min_qual'],run['max_qual'],run['numQualSuppPoints']) 
-      begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*run['numQualSuppPoints'])
-      end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*run['numQualSuppPoints'])
+      currentPlif.limits    = linspace(settings['min_qual'],settings['max_qual'],settings['numQualSuppPoints']) 
+      begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*settings['numQualSuppPoints'])
+      end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*settings['numQualSuppPoints'])
 
       currentPlif.penalties = param[begin:end].flatten().tolist()[0]
       currentPlif.transform = '' 
       currentPlif.name      = 'q' 
 
-      currentPlif.max_len   = run['max_qual'] 
-      currentPlif.min_len   = run['min_qual']
+      currentPlif.max_len   = settings['max_qual'] 
+      currentPlif.min_len   = settings['min_qual']
 
       qualityPlifs[idx] = currentPlif
 
diff --git a/scripts/qpalma_pipeline.py b/scripts/qpalma_pipeline.py
index ac7194e..4df1500 100644 (file)
@@ -65,7 +65,8 @@ class System:
       print '\t\t\tStarting approximation...\n'
       print '#'*80
 
-      # 
+      # When we are given only genomic reads we first generate artificially spliced
+      # ones in order to generate a training set
       pre_task = TrainingPreprocessingTask(self.settings)
       pre_task.createJobs()
       pre_task.submit() 
diff --git a/test.conf b/test.conf
index 3ca1d96..4c8a65b 100644 (file)
--- a/test.conf
+++ b/test.conf
@@ -48,13 +48,7 @@ global_log_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/first_test/qpalma.log
 # The parameter you want to do prediction with:
 #
 
-prediction_parameter_fn = /fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+/param_526.pickle
-
-#
-# The run object which should become at some point obsolete but is still needed for now
-#
-
-run_fn = /fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+/run_obj.pickle
+prediction_param_fn = /fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+/param_526.pickle
 
 #
 # In order to align short reads QPalma relies on additional information such as
@@ -93,24 +87,48 @@ half_window_size = 1500
 #
 #
    
-numLengthSuppPoints  = 10
-numDonSuppPoints     = 10
-numAccSuppPoints     = 10
-matchmatrixRows      = 10
-matchmatrixCols      = 10
-numQualSuppPoints    = 10
-totalQualSuppPoints  = 10
-
 optimizer = MOSEK
 optimizer = CPLEX
 optimizer = CVXOPT
 
 #
-# BLAT, ShoRe and mGene
+# The output format can be one of BLAT, ShoRe or mGene
 #
 
 output_format = BLAT
 
-
 #prb_offset = 50
 prb_offset = 64
+
+enable_quality_scores = True
+enable_splice_scores = True
+enable_intron_length = True
+
+C = 100
+
+remove_duplicate_scores = False
+
+matchmatrixCols         = 1
+matchmatrixRows         = 6
+numAccSuppPoints        = 10
+numConstraintsPerRound  = 50
+numDonSuppPoints        = 10
+numLengthSuppPoints     = 10
+numQualPlifs            = 30
+numQualSuppPoints       = 10
+
+anzpath                 = 2
+
+#enable_intron_length    = True
+#enable_quality_scores   = True
+#enable_splice_signals   = True
+#
+iter_steps              = 40
+max_intron_len          = 2000
+max_qual                = 40
+max_svm_score           = 1.0
+min_intron_len          = 20
+min_qual                = -5
+min_svm_score           = 0.0
+print_matrix            = False
+totalQualSuppPoints     = 300
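+
+# Note: the quality Plif settings above have to be consistent with each other:
+# numQualPlifs corresponds to 5 read symbols x 6 DNA symbols = 30, and
+# totalQualSuppPoints = numQualPlifs * numQualSuppPoints = 300.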
diff --git a/tests/test_qpalma.py b/tests/test_qpalma.py
new file mode 100644 (file)
index 0000000..350e5a7
--- /dev/null
@@ -0,0 +1,279 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+
+import cPickle
+import random
+import math
+import numpy
+import os.path
+import pdb
+import array
+import unittest
+
+from qpalma.qpalma_main import QPalma
+from qpalma.utils import print_prediction
+
+from qpalma.Run import Run
+
+from qpalma.OutputFormat import alignment_reconstruct
+
+from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
+
+jp = os.path.join
+
+
+def createScoresForSequence(full_chromo_seq):
+   """
+   Given a genomic sequence this function calculates random scores for all
+   occurring splice sites.
+   """
+
+   acc_pos = []
+   don_pos = []
+
+   # the first/last 205 pos do not have any predictions
+   for pos in xrange(205, len(full_chromo_seq)-205):
+      if full_chromo_seq[pos-2:pos] == 'ag':
+         acc_pos.append(pos)
+      if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
+         don_pos.append(pos)
+
+   # make pos 1-based
+   acc_pos  = [e+1 for e in acc_pos]
+   don_pos  = [e+1 for e in don_pos]
+
+   acc_scores = [0.0]*len(acc_pos)
+   don_scores = [0.0]*len(don_pos)
+
+   for idx in range(len(acc_pos)):
+      acc_scores[idx] = random.uniform(0.1,1.0)
+   
+   for idx in range(len(don_pos)):
+      don_scores[idx] = random.uniform(0.1,1.0)
+
+   acc_pos = array.array('I',acc_pos)
+   acc_scores = array.array('f',acc_scores)
+
+   don_pos = array.array('I',don_pos)
+   don_scores = array.array('f',don_scores)
+
+   return acc_pos,acc_scores,don_pos,don_scores
+
+
+def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
+   """
+   """
+
+   acc_score_fn   = 'test_data/acc/chromo_1%s.Conf'%strand
+   acc_pos_fn     = 'test_data/acc/chromo_1%s.pos'%strand
+   don_score_fn   = 'test_data/don/chromo_1%s.Conf'%strand
+   don_pos_fn     = 'test_data/don/chromo_1%s.pos'%strand
+
+   acc_scores.tofile(open(acc_score_fn, 'wb'))
+   acc_pos.tofile(open(acc_pos_fn, 'wb'))
+
+   don_scores.tofile(open(don_score_fn, 'wb'))
+   don_pos.tofile(open(don_pos_fn, 'wb'))
+
+
+
+def create_mini_chromosome():
+   
+   chromo_fn   = 'test_data/chromo1.flat'
+
+   chromo_fh = open(chromo_fn)
+   full_chromo_seq = chromo_fh.read()
+   full_chromo_seq = full_chromo_seq.strip()
+
+   # create data for forward strand
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq)
+   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
+
+   # create data for reverse strand
+   full_chromo_seq_rev = reverse_complement(full_chromo_seq)
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev)
+
+   # positions are always stored relative to positive strand
+   #acc_scores.reverse()
+   #acc_scores.
+
+   don_scores.reverse()
+
+   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
+
+
+
+class TestQPalmaPrediction(unittest.TestCase):
+   """
+   This class tests QPalma's prediction on a few small example reads.
+   """
+
+   def _setUp(self):
+      data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
+
+      self.prediction_set = cPickle.load(open(data_fn))
+
+      
+
+   def _setUp(self):
+      print
+      self.prediction_set = {}
+
+      #  xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
+      # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
+
+      read = 'tattttggaagttccaattcccttaatacatctcacag'
+      currentQualities = [[30]*len(read)]
+
+      id       = 1
+      chromo   = 3
+      strand   = '-'
+
+      tsize = 23470805
+
+      genomicSeq_start  = tsize - (2038+10)
+      genomicSeq_stop   = tsize - 1900 + 10
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
+
+      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+      example = (currentSeqInfo,read,currentQualities)
+      self.prediction_set[id] = [example]
+
+      #  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx                                                                                        xxxxxxxx
+      # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
+
+      read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
+      currentQualities = [[40]*30+[22]*8]
+
+      id       = 2
+      chromo   = 3
+      strand   = '+'
+
+      tsize = 23470805
+
+      genomicSeq_start  = tsize - (2038+10)
+      genomicSeq_stop   = tsize - 1900 + 10
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
+
+      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+      example = (currentSeqInfo,read,currentQualities)
+      self.prediction_set[id] = [example]
+
+
+   def setUp(self):
+      self.prediction_set = {}
+
+      # chr1 +  20-120
+      read  = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
+      currentQualities = [[40]*len(read)]
+
+      id       = 3
+      chromo   = 1
+      strand   = '+'
+
+      genomicSeq_start  = 3500
+      genomicSeq_stop   = 6500
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
+
+      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+      example = (currentSeqInfo,read,currentQualities)
+      self.prediction_set[id] = [example]
+
+      # chr1 - 5000-5100
+      read  = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
+      currentQualities = [[40]*len(read)]
+
+      id       = 4
+      chromo   = 1
+      strand   = '-'
+
+      total_size = 30432563
+
+      genomicSeq_start  = total_size - 6500
+      genomicSeq_stop   = total_size - 3500
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
+
+      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+      example = (currentSeqInfo,read,currentQualities)
+      self.prediction_set[id] = [example]
+
+   
+   def testAlignments(self):
+      run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
+
+      run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
+      run['name'] = 'test_run'
+      run['result_dir']    = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
+
+      param_fn = jp(run_dir,'param_526.pickle')
+      param = cPickle.load(open(param_fn))
+
+      print self.prediction_set
+      for example_key in self.prediction_set.keys():
+         print 'Current example %d' % example_key
+
+         for example in self.prediction_set[example_key]:
+            print example
+            print 'size'
+            print len(example)
+
+      # fetch the data needed
+      settings = {}
+
+      settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+      settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      settings['genome_file_fmt']      = 'chr%d.dna.flat'
+      settings['splice_score_file_fmt']= 'contig_%d%s'
+
+      allowed_fragments = [1]
+      settings['allowed_fragments'] = allowed_fragments
+
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
+
+      qp = QPalma(run,seqInfo,True)
+      allPredictions = qp.predict(self.prediction_set,param)
+
+      for current_prediction in allPredictions:
+         align_str = print_prediction(current_prediction)
+         print align_str
+
+         id = current_prediction['id']
+         seq         = current_prediction['read']
+         dna         = current_prediction['dna']
+         chromo      = current_prediction['chr']
+         strand      = current_prediction['strand']
+         start_pos   = current_prediction['start_pos']
+         predExons   = current_prediction['predExons']
+
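+         # predExons lists exon boundaries as consecutive start/stop positions,
+         # so each exon contributes two entries.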
+         numExons = int(math.ceil(len(predExons) / 2.0))
+         
+         print alignment_reconstruct(current_prediction,numExons)
+         print id,start_pos,predExons
+
+      print 'Problem counter is %d' % qp.problem_ctr 
+
+
+if __name__ == '__main__':
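+   # For now only the artificial test dataset is generated; the test suite
+   # below is commented out.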
+   create_mini_chromosome()
+   #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
+   #unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/tests/test_qpalma_prediction.py b/tests/test_qpalma_prediction.py
deleted file mode 100644 (file)
index 0e41bb2..0000000
+++ /dev/null
@@ -1,194 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# Written (W) 2008 Fabio De Bona
-# Copyright (C) 2008 Max-Planck-Society
-
-import cPickle
-import math
-import numpy
-import os.path
-import pdb
-import unittest
-
-from qpalma.qpalma_main import QPalma
-from qpalma.utils import print_prediction
-
-from qpalma.Run import Run
-
-from qpalma.OutputFormat import alignment_reconstruct
-
-from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo
-
-jp = os.path.join
-
-
-class TestQPalmaPrediction(unittest.TestCase):
-   """
-   This class...
-   """
-
-   def _setUp(self):
-      data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
-
-      self.prediction_set = cPickle.load(open(data_fn))
-
-      
-
-   def _setUp(self):
-      print
-      self.prediction_set = {}
-
-      #  xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
-      # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
-
-      read = 'tattttggaagttccaattcccttaatacatctcacag'
-      currentQualities = [[30]*len(read)]
-
-      id       = 1
-      chromo   = 3
-      strand   = '-'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-      #  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx                                                                                        xxxxxxxx
-      # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
-
-      read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
-      currentQualities = [[40]*30+[22]*8]
-
-      id       = 2
-      chromo   = 3
-      strand   = '+'
-
-      tsize = 23470805
-
-      genomicSeq_start  = tsize - (2038+10)
-      genomicSeq_stop   = tsize - 1900 + 10
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-
-   def setUp(self):
-      self.prediction_set = {}
-
-      # chr1 +  20-120
-      read  = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
-      currentQualities = [[40]*len(read)]
-
-      id       = 3
-      chromo   = 1
-      strand   = '+'
-
-      genomicSeq_start  = 3500
-      genomicSeq_stop   = 6500
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-      # chr1 - 5000-5100
-      read  = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
-      currentQualities = [[40]*len(read)]
-
-      id       = 4
-      chromo   = 1
-      strand   = '-'
-
-      total_size = 30432563
-
-      genomicSeq_start  = total_size - 6500
-      genomicSeq_stop   = total_size - 3500
-      print 'Position: ',
-      print genomicSeq_start,genomicSeq_stop 
-
-      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
-
-      example = (currentSeqInfo,read,currentQualities)
-      self.prediction_set[id] = [example]
-
-   
-   def testAlignments(self):
-      run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
-
-      run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
-      run['name'] = 'test_run'
-      run['result_dir']    = '/fml/ag-raetsch/home/fabio/tmp/sandbox/testcases'
-
-      param_fn = jp(run_dir,'param_526.pickle')
-      param = cPickle.load(open(param_fn))
-
-      print self.prediction_set
-      for example_key in self.prediction_set.keys():
-         print 'Current example %d' % example_key
-
-         for example in self.prediction_set[example_key]:
-            print example
-            print 'size'
-            print len(example)
-
-      # fetch the data needed
-      settings = {}
-
-      settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
-      settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
-      settings['genome_file_fmt']      = 'chr%d.dna.flat'
-      settings['splice_score_file_fmt']= 'contig_%d%s'
-
-      allowed_fragments = [1]
-      settings['allowed_fragments'] = allowed_fragments
-
-      accessWrapper = DataAccessWrapper(settings)
-      seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
-
-      qp = QPalma(run,seqInfo,True)
-      allPredictions = qp.predict(self.prediction_set,param)
-
-      for current_prediction in allPredictions:
-         align_str = print_prediction(current_prediction)
-         print align_str
-
-         id = current_prediction['id']
-         seq         = current_prediction['read']
-         dna         = current_prediction['dna']
-         chromo      = current_prediction['chr']
-         strand      = current_prediction['strand']
-         start_pos   = current_prediction['start_pos']
-         predExons   = current_prediction['predExons']
-
-         numExons = int(math.ceil(len(predExons) / 2))
-         
-         print alignment_reconstruct(current_prediction,numExons)
-         print id,start_pos,predExons
-
-      print 'Problem counter is %d' % qp.problem_ctr 
-
-
-if __name__ == '__main__':
-   suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
-   unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/tests/test_sequence_utils.py b/tests/test_sequence_utils.py
deleted file mode 100644 (file)
index 4c6483b..0000000
+++ /dev/null
@@ -1,276 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-import pdb
-import unittest
-import numpy
-
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement
-from qpalma.Lookup import LookupTable
-
-class TestSequenceUtils(unittest.TestCase):
-
-
-   def setUp(self):
-      self.strands = ['+','']
-
-      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
-      g_fmt = 'chr%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      num_chromo = 6
-
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
-      print self.seqInfo.chromo_sizes
-
-      #self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))
-
-
-   def testThalianaDataExamples(self):
-      seq = 'TGAAAACAGGAACGGATTGGAGAAAGGCGTCTCGTCAT'.lower()
-      chromo = 3
-      strand = '+'
-      pos = 19246391
-
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-      
-      seq = 'AGGCAATGAAACTGATGCATTGGACTTGACGGGTGTTG'.lower()
-      chromo = 5
-      strand = '+'
-      pos = 15394760
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'TCTTGGTGGAGGAGCTAACACCGTAGCTGACGGTTACA'.lower()
-      chromo = 4
-      strand = '+'
-      pos = 16709206
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'TTGGAAGACAGAGTCAACCATACCCTTGCCTCTGGTGA'.lower()
-      chromo = 2
-      strand = '+'
-      pos = 16579649
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
-      seq = reverse_complement(seq)
-      chromo = 1
-      strand = '-'
-      pos = 10475515
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'TTTTTCCCTTCTAGAAGACCGTAAAGGTAAACTTCTAA'.lower()
-      seq = reverse_complement(seq)
-      chromo = 3
-      strand = '-'
-      pos = 17143420
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
-      seq = reverse_complement(seq)
-      chromo = 4
-      strand = '-'
-      pos = 18083761
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      window = 113
-      seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
-      seq = reverse_complement(seq)
-      chromo = 4
-      strand = '-'
-      pos = 18083761-window
-      dna,acc,don = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38+2*window,False)
-      self.assertEqual(seq,dna[window:-window])
-
-      #print dna
-      #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],acc))
-      #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],don))
-
-
-   def _testThalianaDataGeneric(self):
-
-      dna,acc,don = self.seqInfo.get_seq_and_scores(1,'+',1000,1369)
-      dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'+',1000,1369,'')
-
-      self.assertEqual(dna,dna_)
-      self.assertEqual(acc,acc_)
-      self.assertEqual(don,don_)
-
-      dna,acc,don = self.seqInfo.get_seq_and_scores(1,'-',1000,1369)
-      dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'-',1000,1369,'')
-
-      self.assertEqual(dna,dna_)
-      self.assertEqual(acc,acc_)
-      self.assertEqual(don,don_)
-
-      #dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1000,1369)
-
-      #dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1000,1369)
-      #dna_,acc_,don_ = lt1.get_seq_and_scores(1,'-',1000,1369,'')
-
-      #self.assertEqual(dna,dna_)
-      #self.assertEqual(acc,acc_)
-      #self.assertEqual(don,don_)
-
-
-      #dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1000,1369)
-
-
-   def _testLyrataData(self):
-      g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
-
-      g_fmt = 'contig%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      num_chromo = 1099
-   
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(45,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(45,'+',1,1369)
-
-      print 'Finished'
-      #num_tests = 10
-      #for chromo in range(1,6):
-      #   for strand in ['+','-']:
-      #      for test_idx in range(num_tests):
-      #         if strand == '-':
-      #            size = seqInfo.chromo_sizes[chromo+7]
-      #         else:
-      #            size = seqInfo.chromo_sizes[chromo]
-      #         begin = random.randint(0,size)
-      #         end   = random.randint(b,size)
-      #         dna,acc,don = seqInfo.get_seq_and_scores(chromo,strand,begin,end)
-
-
-   def tearDown(self):
-      pass
-
-
-class TestLookupTable(unittest.TestCase):
-
-   def setUp(self):
-      pass
-
-
-   def testTableThalianaData(self):
-      """
-      """
-
-      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-      g_fmt    = 'chr%d.dna.flat'
-      s_fmt    = 'contig_%d%s'
-
-      settings = {}
-      settings['genome_dir']           = g_dir
-      settings['acceptor_scores_loc']  = acc_dir
-      settings['donor_scores_loc']     = don_dir 
-
-      settings['genome_file_fmt']      = g_fmt
-      settings['splice_score_file_fmt']= s_fmt
-
-      settings['allowed_fragments']    = [1]
-
-      lt1 = LookupTable(settings)
-
-      accessWrapper = DataAccessWrapper(settings)
-      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
-
-      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
-      seq = reverse_complement(seq)
-      chromo = 1
-      strand = '-'
-      pos = 10475515
-      dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-
-      self.assertEqual(seq,dna)
-
-      full_info = lt1.get_seq_and_scores(chromo,strand,pos,pos+38)
-      dna = full_info[0]
-
-      self.assertEqual(seq,dna)
-
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
-      dna_,acc_,don_ = lt1.get_seq_and_scores(1,'+',1,1369)
-
-
-
-   def _testTableLyrataData(self):
-      g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
-
-      g_fmt = 'contig%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(0,1099))
-
-
-   def tearDown(self):
-      pass
-
-
-def check_wrapper():
-   g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
-   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
-   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
-
-   g_fmt = 'contig%d.dna.flat'
-   s_fmt = 'contig_%d%s.Conf_cum'
-
-   test = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-
-   for idx in range(1,100):
-      pos = test.get_genomic_fragment_fn(idx,'+')
-      neg = test.get_genomic_fragment_fn(idx,'-')
-      print pos,neg
-      assert os.path.exists(pos)
-      assert os.path.exists(neg)
-
-      acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'+')
-      print acc_fn,don_fn
-      assert os.path.exists(acc_fn)
-      assert os.path.exists(don_fn)
-      acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'-')
-      print acc_fn,don_fn
-      assert os.path.exists(acc_fn)
-      assert os.path.exists(don_fn)
-
-
-if __name__ == '__main__':
-   #suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
-   suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
-   unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644 (file)
index 0000000..4c6483b
--- /dev/null
@@ -0,0 +1,276 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import os.path
+import pdb
+import unittest
+import numpy
+
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement
+from qpalma.Lookup import LookupTable
+
+class TestSequenceUtils(unittest.TestCase):
+
+
+   def setUp(self):
+      self.strands = ['+','-']
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      num_chromo = 6
+
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+      print self.seqInfo.chromo_sizes
+
+      #self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))
+
+
+   def testThalianaDataExamples(self):
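+      """
+      Spot-check several 38 bp sequences with known genomic coordinates against
+      the sequences returned by SeqSpliceInfo, on both strands.
+      """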
+      seq = 'TGAAAACAGGAACGGATTGGAGAAAGGCGTCTCGTCAT'.lower()
+      chromo = 3
+      strand = '+'
+      pos = 19246391
+
+      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+      
+      seq = 'AGGCAATGAAACTGATGCATTGGACTTGACGGGTGTTG'.lower()
+      chromo = 5
+      strand = '+'
+      pos = 15394760
+      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
+      seq = 'TCTTGGTGGAGGAGCTAACACCGTAGCTGACGGTTACA'.lower()
+      chromo = 4
+      strand = '+'
+      pos = 16709206
+      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
+      seq = 'TTGGAAGACAGAGTCAACCATACCCTTGCCTCTGGTGA'.lower()
+      chromo = 2
+      strand = '+'
+      pos = 16579649
+      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
+      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
+      seq = reverse_complement(seq)
+      chromo = 1
+      strand = '-'
+      pos = 10475515
+      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
+      seq = 'TTTTTCCCTTCTAGAAGACCGTAAAGGTAAACTTCTAA'.lower()
+      seq = reverse_complement(seq)
+      chromo = 3
+      strand = '-'
+      pos = 17143420
+      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
+      seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
+      seq = reverse_complement(seq)
+      chromo = 4
+      strand = '-'
+      pos = 18083761
+      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      self.assertEqual(seq,dna)
+
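+      # Fetch a larger window including splice site scores and check that the
+      # central 38 bp match the expected sequence.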
+      window = 113
+      seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
+      seq = reverse_complement(seq)
+      chromo = 4
+      strand = '-'
+      pos = 18083761-window
+      dna,acc,don = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38+2*window,False)
+      self.assertEqual(seq,dna[window:-window])
+
+      #print dna
+      #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],acc))
+      #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],don))
+
+
+   def _testThalianaDataGeneric(self):
+
+      dna,acc,don = self.seqInfo.get_seq_and_scores(1,'+',1000,1369)
+      dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'+',1000,1369,'')
+
+      self.assertEqual(dna,dna_)
+      self.assertEqual(acc,acc_)
+      self.assertEqual(don,don_)
+
+      dna,acc,don = self.seqInfo.get_seq_and_scores(1,'-',1000,1369)
+      dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'-',1000,1369,'')
+
+      self.assertEqual(dna,dna_)
+      self.assertEqual(acc,acc_)
+      self.assertEqual(don,don_)
+
+      #dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1000,1369)
+      #dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1000,1369)
+      #dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1000,1369)
+      #dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1000,1369)
+
+      #dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1000,1369)
+      #dna_,acc_,don_ = lt1.get_seq_and_scores(1,'-',1000,1369,'')
+
+      #self.assertEqual(dna,dna_)
+      #self.assertEqual(acc,acc_)
+      #self.assertEqual(don,don_)
+
+
+      #dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1000,1369)
+      #dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1000,1369)
+      #dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1000,1369)
+      #dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1000,1369)
+
+
+   def _testLyrataData(self):
+      g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      g_fmt = 'contig%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      num_chromo = 1099
+   
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
+      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(45,'-',1,1369)
+      dna,acc,don = seqInfo.get_seq_and_scores(45,'+',1,1369)
+
+      print 'Finished'
+      #num_tests = 10
+      #for chromo in range(1,6):
+      #   for strand in ['+','-']:
+      #      for test_idx in range(num_tests):
+      #         if strand == '-':
+      #            size = seqInfo.chromo_sizes[chromo+7]
+      #         else:
+      #            size = seqInfo.chromo_sizes[chromo]
+      #         begin = random.randint(0,size)
+      #         end   = random.randint(b,size)
+      #         dna,acc,don = seqInfo.get_seq_and_scores(chromo,strand,begin,end)
+
+
+   def tearDown(self):
+      pass
+
+
+class TestLookupTable(unittest.TestCase):
+
+   def setUp(self):
+      pass
+
+
+   def testTableThalianaData(self):
+      """
+      """
+
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+      g_fmt    = 'chr%d.dna.flat'
+      s_fmt    = 'contig_%d%s'
+
+      settings = {}
+      settings['genome_dir']           = g_dir
+      settings['acceptor_scores_loc']  = acc_dir
+      settings['donor_scores_loc']     = don_dir 
+
+      settings['genome_file_fmt']      = g_fmt
+      settings['splice_score_file_fmt']= s_fmt
+
+      settings['allowed_fragments']    = [1]
+
+      lt1 = LookupTable(settings)
+
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
+      seq = reverse_complement(seq)
+      chromo = 1
+      strand = '-'
+      pos = 10475515
+      dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+
+      self.assertEqual(seq,dna)
+
+      full_info = lt1.get_seq_and_scores(chromo,strand,pos,pos+38)
+      dna = full_info[0]
+
+      self.assertEqual(seq,dna)
+
+      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
+      dna_,acc_,don_ = lt1.get_seq_and_scores(1,'+',1,1369)
+
+
+
+   def _testTableLyrataData(self):
+      g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+      g_fmt = 'contig%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(0,1099))
+
+
+   def tearDown(self):
+      pass
+
+
+def check_wrapper():
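+   """
+   Sanity check (not run as part of the unit tests): verify that the genomic
+   fragment and splice site score files referenced by a DataAccessWrapper
+   exist on disk.
+   """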
+   g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+   g_fmt = 'contig%d.dna.flat'
+   s_fmt = 'contig_%d%s.Conf_cum'
+
+   test = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+
+   for idx in range(1,100):
+      pos = test.get_genomic_fragment_fn(idx,'+')
+      neg = test.get_genomic_fragment_fn(idx,'-')
+      print pos,neg
+      assert os.path.exists(pos)
+      assert os.path.exists(neg)
+
+      acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'+')
+      print acc_fn,don_fn
+      assert os.path.exists(acc_fn)
+      assert os.path.exists(don_fn)
+      acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'-')
+      print acc_fn,don_fn
+      assert os.path.exists(acc_fn)
+      assert os.path.exists(don_fn)
+
+
+if __name__ == '__main__':
+   #suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
+   suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
+   unittest.TextTestRunner(verbosity=2).run(suite)