+ extended documentation
author Fabio <fabio@congo.fml.local>
Mon, 13 Oct 2008 15:39:44 +0000 (17:39 +0200)
committer Fabio <fabio@congo.fml.local>
Mon, 13 Oct 2008 15:39:44 +0000 (17:39 +0200)
+ fixed index issues in sequence utils
+ added small example script for splice site score creation

40 files changed:
doc/qpalma-manual.tex
qpalma/DatasetUtils.py
qpalma/Lookup.py
qpalma/OutputFormat.py
test.conf
tests/test_qpalma.py
tests/test_qpalma_installation.py
tests/test_utils.py
tools/calculateAlignmentQuality.m [deleted file]
tools/calculateSizes [deleted file]
tools/data_tools/Makefile [deleted file]
tools/data_tools/datastructures.c [deleted file]
tools/data_tools/datastructures.h [deleted file]
tools/data_tools/debug_tools.c [deleted file]
tools/data_tools/filterReads.c [deleted file]
tools/data_tools/filterReads.py [deleted file]
tools/data_tools/parser.c [deleted file]
tools/data_tools/read.h [deleted file]
tools/data_tools/sort_genes.c [deleted file]
tools/data_tools/tools.c [deleted file]
tools/dataset_scripts/countNotOnChr1-5_pos.py [deleted file]
tools/dataset_scripts/dataStatistic.py [deleted file]
tools/dataset_scripts/mappingInfo.sh [deleted file]
tools/evaluatePrediction.py [deleted file]
tools/extractESTs [deleted file]
tools/plot_param.py [deleted file]
tools/run_specific_scripts/training_dataset/compile_dataset.py [deleted file]
tools/run_specific_scripts/training_dataset/createNewDataset.py [deleted file]
tools/run_specific_scripts/transcriptome_analysis/README [deleted file]
tools/run_specific_scripts/transcriptome_analysis/combine_spliced_map_parts.sh [deleted file]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/Makefile [deleted file]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c [deleted file]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/gt_test [deleted file]
tools/run_specific_scripts/transcriptome_analysis/compare_predictions/pred_test [deleted file]
tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py [deleted file]
tools/run_specific_scripts/transcriptome_analysis/createExonInfoForGenefinding.py [deleted file]
tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh [deleted file]
tools/run_specific_scripts/transcriptome_analysis/createGenefindingInfo.sh [deleted file]
tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py [deleted file]
tools/spliceScoreConverter.py [new file with mode: 0644]

index 908dead..89c6e32 100644 (file)
@@ -7,6 +7,7 @@
 \newcommand{\QPA}{{\sl QPALMA alignment algorithm }}
 \newcommand{\QPH}{{\sl QPALMA approximation }}
 \newcommand{\QPP}{{\sl QPALMA pipeline }}
+\newcommand{\qparam}[1]{{\bf #1}}
 
 \title{QPalma Documentation}
 \author{Fabio De Bona}
 \section{Intro}
 
 \QP is an alignment tool targeted to align spliced reads produced by ``Next
-Generation'' sequencing platforms such as \emph{Illumina Genome Analyzer} or \emph{454}.
-The basic idea is to use an extended Smith-Waterman algorithm for local
-alignments that uses the base quality information of the reads directly in the
-alignment step. Optimal alignment parameters i.e. scoring matrices are inferred
-using a machine learning technique similar to \emph{Support Vector Machines}.
-For further details on \QP itself consult the paper \cite{DeBona08}. For
-details about the learning method see \cite{Tsochantaridis04}.
+Generation'' sequencing platforms such as \emph{Illumina Genome Analyzer} or
+\emph{454}.  The basic idea is to use an extended Smith-Waterman algorithm for
+local alignments that uses the base quality information of the reads directly
+in the alignment step. Optimal alignment parameters i.e. scoring matrices are
+inferred using a machine learning technique similar to \emph{Support Vector
+Machines}.  For further details on \QP itself consult the paper
+\cite{DeBona08}. For details about the learning method see
+\cite{Tsochantaridis04}.
 
 %
 %
 %
 \section{Installation}
 
-The following installation notes assume that you are working on a \emph{Linux}/*NIX
-environment. \emph{Windows} or \emph{MacOS} versions do not exist and are not planned. You
-should have a working \emph{DRMAA} (http://www.drmaa.org/) installation to
-submit jobs to a cluster. \QP is written in C++ and Python and tested on the
-\emph{gcc} compiler suite and python2.5.
+The following installation notes assume that you are working in a
+\emph{Linux}/*NIX environment. \emph{Windows} or \emph{Mac OS} versions do not
+exist and are not planned. You should have a working \emph{DRMAA}
+(http://www.drmaa.org/) installation to submit jobs to a cluster. \QP is
+written in C++ and Python and tested with the \emph{gcc} compiler suite and
+python2.5.
 
 \subsection{Dependencies}
 
-\QP was designed to be as self-contained as possible. However it still has some dependencies.
-In order to install \QP completely you will need:
+\QP was designed to be as self-contained as possible. However it still has some
+dependencies.  In order to install \QP completely you will need:
 \begin{itemize}
 \item $SWIG$, the simple wrapper interface generator,
 \item[$\rightarrow$] http://www.swig.org
@@ -70,84 +73,163 @@ For training \QP you need one of the following optimization toolkits:
 \end{enumerate}
 \noindent
 In order to check your installation you can run the script
-test\_qpalma\_installation.py which can be found in the directory tests/. This
-script either reports a successful installation or generates an error log file.
-You can send this error log file to qpalma@tuebingen.mpg.de.
-\\ \noindent
+test\_qpalma\_installation.py which can be found in the directory tests/ (Enter
+the directory first as the script works with relative paths). This script
+either reports a successful installation or generates an error log file.  You
+can send this error log file to qpalma@tuebingen.mpg.de.
+\\ \noindent 
 In order to make a full test run with sample data you can run the script
-test\_complete\_pipeline.py. This work on a small artificial dataset performs
-some checks.
+test\_complete\_pipeline.py. This script executes the full pipeline with data
+from a small artificial data set.
 
 %
 %
 %
 \section{Working with \QP}
 
-I assume now that you have a successful \QP installation. When working with \QP
-you usually only deal with two commands:
+We assume now that you have a successful \QP installation. When working with \QP
+you usually need two commands:
 
 \begin{itemize}
 \item python qpalma\_pipeline.py train example.conf
 \item python qpalma\_pipeline.py predict example.conf
 \end{itemize}
 \noindent
-\QP has two modes \emph{predict} and \emph{train} all settings are supplied in
+\QP has two modes \emph{predict} and \emph{train}. All settings are supplied in
 a configuration file here example.conf. \\ \noindent
 
+Basically \QP assumes that you have the following data:
+\begin{itemize}
+\item The reads you want to align together with their seed positions,
+\item genomic sequences,
+\item splice site scores predicted for the genomic sequences. 
+\end{itemize}
+
 A typical run consists of 
 \begin{enumerate}
+\item Find the seeds for your reads using another method (NOT VIA \QP). For example vmatch.
 \item Set your parameters (read size, number of mismatches, etc.) in the
 configuration files.
+\item Train a parameter vector once for your organism.
+\item Predict read alignments using the trained parameters.
 \end{enumerate}
-
-Basically \QP assumes that you have the following data:
-\begin{itemize}
-\item The reads you want to align,
-\item parts/full genomic sequences of an organism,
-\item splice site scores predicted for the genomic sequences.
-\end{itemize}
-
-\QP has one central configuration file: 
-
-Suppose you have a
-
-The project results directory (\emph{result\_dir}) contains then the subdirectories
-\begin{itemize}
-\item \emph{mapping} with subdirs main and spliced
-\item \emph{alignment} with subdirs for the different parameters and \emph{heuristic}
-\item \emph{remapping}
-\end{itemize}
+\noindent
+Note: To obtain splice site scores we use the predictions of another tool of
+our group \cite{SonnenburgSchweikertPhilipsBehrRaetsch07}. However this tool is
+not available yet but you can use any splice site prediction tool you prefer as
+long as you stick to the data format described below.
+\\ \\ \noindent
+\QP uses one central configuration file. The parameters you can set in this file are
+described below.
 
 \subsection{The configuration file}
 
-Via the variable ``result\_dir'' you can specify where all of QPalma's data should reside.
-This directory contains the following subdirectories:
+In order to run \QP successfully you will need to create a configuration file.
+All valid parameters for \QP are summarized below. You can also look at the
+example\_configuration.conf file which is included in the distribution.
+\\ \noindent
+The parameters do not have to be in a particular order. In order to simplify
+explanations we have grouped them according to their particular semantics.
+\\ \noindent
+Via the variable ``result\_dir'' you can specify where all of QPalma's data
+should reside. During initialization of \QP the following sub directories of
+result\_dir will be automatically created:
 \begin{itemize}
 \item preprocessing
 \item approximation
+\item dataset
 \item prediction
+\item alignment
 \item postprocessing, and 
-\item training
+\item training.
+\end{itemize}
+\noindent
+Each of these directories will hold all data created in a particular step of the
+\QP pipeline.  The final alignments can be found in the alignment directory.
+\\ \noindent
+The first group consists of general settings for \QP:
+\begin{itemize}
+\item \qparam{perform\_checks} - Enables some checks and debugging output.
+\item \qparam{platform} - IGA or $454$ for Illumina Genome Analyzer or Roche's $454$.
+\item \qparam{result\_dir} - This is the toplevel result directory.
+\item \qparam{spliced\_reads\_fn} - A file consisting of putative spliced reads (obtained for example via vmatch)
+\item \qparam{unspliced\_reads\_fn} - A file consisting of putative unspliced reads (obtained for example via vmatch)
+\item \qparam{num\_splits} - number of computing nodes
+\item \qparam{prediction\_param\_fn} - For prediction needed. This is the location of the trained parameter file.
+\end{itemize}
+\noindent
+The second group includes parameters needed for accessing the sequence data and
+the splice site predictions:
+\begin{itemize}
+\item \qparam{genome\_dir} - The location of the genomic sequence files.
+\item \qparam{acceptor\_scores\_loc} - The location of the acceptor scores files.
+\item \qparam{donor\_scores\_loc} - The location of the donor scores files.
+\item \qparam{genome\_file\_fmt} - A format string describing how valid file names are generated. See description below.
+\item \qparam{splice\_score\_file\_fmt} - A format string describing how valid file names for the splice site scores are generated. 
+\item \qparam{allowed\_fragments} - A list of the form ``[1,2,4]'' describing the valid file name numbers. See description below.
+\item \qparam{half\_window\_size} - Given an alignment seed position for a
+given read we cut out the area [1500-seed\_pos,seed\_pos+1500].
+\item \qparam{output\_format} - The output format can be either blat, ShoRe or
+mGene.
+\item \qparam{prb\_offset} - We expect the quality values to be saved as ascii
+strings so the quality is calculated via ord(qchar)-prb\_offset ($ord(\cdot)$
+returns the numeric ascii value). For example an 'h' would correspond to
+quality $40$.
+\end{itemize}
+\noindent
+The genomic sequence and the splice site scores file are accessed using the
+format strings given above. This works as follows: Suppose we have two
+chromosomes 1 \& 4 we want to align to. 
+The sequences are in flat files:
+\begin{center}
+/home/fabio/genomic\_sequence/chromosome\_1.flat \\ \noindent
+/home/fabio/genomic\_sequence/chromosome\_4.flat
+\end{center}
+\noindent
+Then we set genome\_dir to \emph{/home/fabio/genomic\_sequence},
+genome\_file\_fmt to ``chromosome\_\%d.flat'' and allowed\_fragments to [1,4].
+\\ \noindent
+Parameters needed for training are:
+\begin{itemize}
+\item C - this is a parameter trading off between loss term and regularization (similar to C in SVM training).
+\item enable\_quality\_scores - You can enable or disable the use of quality
+scores via setting this parameter to True or False
+\item enable\_splice\_scores - You can enable or disable the use of splice site
+scores via setting this parameter to True or False
+\item enable\_intron\_length - You can enable or disable the use of the intron
+length scoring via setting this parameter to True or False
+\item optimizer - Either one of CVXOPT, MOSEK or CPLEX.
+\item numConstraintsPerRound - How many constraints per optimizer call should be generated (if unsure set to $50$).
+\end{itemize}
+\noindent
+For training and prediction purposes you can set the number of support points
+for the piecewise linear functions of the scoring matrix. This effectively
+changes the total number of parameters of the model. That means that you can
+only predict with a parameter vector which was trained with the same settings.
+(We refer to piecewise linear functions as \emph{plifs}.)
+\begin{itemize}
+\item numAccSuppPoints - The number of support points for the plif scoring the acceptor scores.
+\item numDonSuppPoints - The number of support points for the plif scoring the donor scores.
+\item numLengthSuppPoints - The number of support points for the plif scoring the intron lengths.
+\item min\_intron\_len,max\_intron\_len - Set this to the range of minimum and maximum intron lengths.
+\item min\_qual and max\_qual - Set this to the range of possible quality values. (Illumina GA usually [-5,40]).
+\item min\_svm\_score and max\_svm\_score - As we work with normalized (in the
+range [0.0,1.0]) splice site scores this is usually $0.0$ resp. $1.0$.
+\end{itemize}
+\noindent
+The following last group of parameters includes ``low-level'' parameters. Just
+keep the default values of these parameters. Possible extensions of \QP may use
+them differently.
+\begin{itemize}
+\item remove\_duplicate\_scores
+\item print\_matrix
+\item matchmatrixCols, matchmatrixRows
+\item numQualPlifs
+\item numQualSuppPoints
+\item totalQualSuppPoints
+\item anzpath
+\item iter\_steps
 \end{itemize}
-
-%
-%
-%
-\section{Pipeline}
-
-The full pipline consists of $n$ steps:
-
-\begin{enumerate}
-\item Find alignment seeds using a fast suffix array method (vmatch) for
-all given reads. This may take several rounds for subsets of the reads.
-\item Preprocess the reads and their seeds. Convert them to a qpalma format with some sanity checks.
-\item Use the \QPH to identify those reads that have a full seed but might be
-spliced anyways.
-\item Once we identified all potentially spliced reads we use \QPA to align
-those to their seed regions.
-\item One can choose between several post-processing steps in order to refine
-the quality of the alignments via filtering.
-\end{enumerate}
 
 %
 %
@@ -159,8 +241,8 @@ by the users in order to make \QP work.
 
 \subsection{Format of the configuration file}
 
-The configuration file includes are settings \QP needs to perform an analysis.
-This includes paths to file where the raw data exists as well as settings which
+The configuration file includes all settings \QP needs to perform an analysis.
+This includes paths to files where the raw data exists as well as settings which
 sequencing platform is being used, the number of cluster nodes to employ, etc.
 
 Its values are in the form 
@@ -169,37 +251,45 @@ key = value
 \end{center}
 and ``\#'' for lines containing comments.
 
+
 \subsection{Read format and internal representation}
 
-The format of the file containing the mapped short reads is as follows.  Each
-line corresponds to one short read. Each line has six tab-separated entries,
-namely:
+The read input files for \QP contain the read sequences with their quality as
+well as some information from the first mapping / seed region finding. The
+format of the file containing the mapped short reads is as follows.  Each line
+corresponds to one short read. Each line has six tab-separated entries, namely:
 
 \begin{enumerate}
 \item unique read id
 \item chromosome/contig id
-\item position of match in chromosome/contig
+\item position of match in chromosome/contig (0-based, relative to positive strand)
 \item strand 
 \item read sequence (in strand specific direction)
 \item read quality (in strand specific direction)
 \end{enumerate}
 
 Strand specific direction means that \QP assumes that the reads are already in
-their true orientation and the qualities as well. Internally there is no
-reverse complementing taking place.
+their true orientation and the qualities as well.
 
 \subsection{Splice Scores}
 
-The splice site scores where generated by the software... . If you would like
+As mentioned before, the splice site scores were generated using a tool
+described in \cite{SonnenburgSchweikertPhilipsBehrRaetsch07}. If you would like
 to use your own splice site predictions you can create files according to the
-format \QP uses.  This splice site scores format is described. For each
-canonical acceptor ($AG$) and donor site ($GT$/$GC$) \QP expects a score. For every
-chromosome or contig we have a four files.  For each strand we have a binary
-file containing the positions and a binary file containing the scores. The
-positions are stored as unsigned values and the scores as floats.  The
-positions are 1-based and the assignment of positions and their scores is as
-follows: The acceptor score positions are the positions right after the $AG$ and the
-donor score positions are the positions right on the $G$ of the $GT$. For example:
+format \QP uses.  This splice site scores format is described below:
+\\ \noindent
+For each canonical acceptor ($AG$) and donor site ($GT$/$GC$) \QP expects a
+score. The data is organized in files for each signal (acc/don) for each strand
+(+/-).  The information on positions of canonical splice sites and the
+corresponding scores lies in separate files.  Every chromosome or contig leads
+then to $8$ files (acc/don, +/- and pos/score).
+The position and score files are raw binary files containing the ordered positions
+and the scores. The positions are stored as unsigned values and the scores as
+floats. Note that you have to be careful when working in an inhomogeneous
+cluster environment (endianness, data type size). The positions are 1-based
+and the assignment of positions and their scores is as follows: The acceptor
+score positions are the positions right after the $AG$ and the donor score
+positions are the positions right on the $G$ of the $GT$.  For example:
 \begin{center}
 \begin{tabular}{ccccccccccc}
 ... & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 & ... \\ 
@@ -207,8 +297,9 @@ donor score positions are the positions right on the $G$ of the $GT$. For exampl
 ... &   & 0.2&  &   &   &   &   &    & 0.3 & ... 
 \end{tabular}
 \end{center}
-We supply a script  for conversion of ascii to binary files. You can use this
-script as a template to make your own scoring information files.
+We supply a script ``spliceScoreConverter.py'' for conversion of ascii to
+binary files. You can use this script as a template to make your own scoring
+information files.
 
 \section{Remarks}
 
@@ -234,6 +325,11 @@ The official \QP project email address is:
   \newblock Support Vector Machine Learning for Interdependent and Structured Output Spaces
    \newblock {\em Proceedings of the 16th International Conference on Machine Learning}, 2004
 
+\bibitem[3]{SonnenburgSchweikertPhilipsBehrRaetsch07}
+   S{\"o}ren~Sonnenburg~and~Gabriele~Schweikert~and~Petra~Philips~and~Jonas~Behr~and~Gunnar~R{\"a}tsch
+   \newblock Accurate splice site prediction using support vector machines
+   \newblock {\em BMC Bioinformatics 2007, 8(Suppl 10):S7}, December 2007
+
 \end{thebibliography}
 %
 %
index 981455c..669d619 100644 (file)
@@ -80,7 +80,7 @@ def addToDataset(map_file,dataset,settings):
       else:
          us_offset = pos - 1 
 
-      if pos+half_window_size < seqInfo.chromo_sizes[chromo]:
+      if pos+half_window_size < seqInfo.getFragmentSize(chromo):
          ds_offset = half_window_size
       else:
          ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
index bc36f96..5d3b7df 100644 (file)
@@ -90,7 +90,7 @@ class LookupTable:
 
       strand = '+'
       genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
-      print len(currentAcc),len(currentDon)
+      #print len(currentAcc),len(currentDon)
       genomicSeq = genomicSeq.lower()
 
       chromo_idx = self.chromo_map[chromo]
@@ -102,7 +102,7 @@ class LookupTable:
 
       strand = '-'
       genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
-      print len(currentAcc),len(currentDon)
+      #print len(currentAcc),len(currentDon)
       genomicSeq = genomicSeq.lower()
 
       newCurrentAcc = [-inf] 
@@ -111,7 +111,6 @@ class LookupTable:
       newCurrentDon = [-inf] 
       newCurrentDon.extend(currentDon[:-1])
       currentDon = newCurrentDon
-      print len(currentAcc),len(currentDon)
 
       self.acceptorScoresNeg[chromo_idx]   = currentAcc
       self.donorScoresNeg[chromo_idx]      = currentDon
index d9ba9b0..1939791 100644 (file)
@@ -110,8 +110,8 @@ def recalculatePositions(chromo,start_pos,strand,starts,ends,seqInfo,window_size
       start_pos = total_size - start_pos
 
    if strand == '+':
-      starts   = [int(elem) + start_pos-2 for elem in starts]
-      ends     = [int(elem) + start_pos-2 for elem in ends]
+      starts   = [int(elem) + start_pos-1 for elem in starts]
+      ends     = [int(elem) + start_pos-1 for elem in ends]
    else:
       starts   = [int(elem) + start_pos for elem in starts]
       ends     = [int(elem) + start_pos for elem in ends]
@@ -147,8 +147,8 @@ def createAlignmentOutput(settings,chunk_fn,result_fn):
    written_lines = 0
    for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
 
-      if output_format == 'BLAT':
-         # BLAT output
+      if output_format == 'blat':
+         # blat output
          new_line = createBlatOutput(current_prediction,settings)
 
       elif output_format == 'ShoRe':
@@ -171,7 +171,7 @@ def createAlignmentOutput(settings,chunk_fn,result_fn):
 def createBlatOutput(current_prediction,settings):
    """
    This function takes one prediction as input and returns the corresponding
-   alignment in BLAT format:
+   alignment in blat format:
 
     1. matches     - Number of bases that match that aren't repeats
     2. misMatches  - Number of bases that don't match
index 39e2b0c..6be32a6 100644 (file)
--- a/test.conf
+++ b/test.conf
@@ -89,7 +89,7 @@ optimizer = CVXOPT
 # The output format can be one of BLAT, ShoRe or mGene
 #
 
-output_format = BLAT
+output_format = blat
 
 #prb_offset = 50
 prb_offset = 64
index 5f62f40..ba87a16 100644 (file)
@@ -14,6 +14,7 @@ import cPickle
 import random
 import math
 import numpy
+from numpy import inf
 import os.path
 import pdb
 import array
@@ -350,20 +351,43 @@ class TestQPalmaPrediction(unittest.TestCase):
 
       print 'Problem counter is %d' % qp.problem_ctr 
 
+def check_reverse_strand_calculation(id,b,e,seqInfo):
+   
+   seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
 
-def simple_check():
-   # fetch the data needed
-   settings = {}
+   total_size = seqInfo.getFragmentSize(1)
+   bp = total_size - e
+   ep = total_size - b
+   seqp,acc,don  = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
+   seqp = reverse_complement(seqp)
+
+   return (seq == seqp)
+
+
+def simple_check(settings,seqInfo,lt1):
+
+   print 'Checking sequences for some intervals...'
+   
+   intervals = [(0,10000),(545,874),(999,1234)]
+
+   for (b,e) in intervals:
+      print check_reverse_strand_calculation(1,b,e,seqInfo)
+
+   for (b,e) in [(206,874),(545,874),(999,1234)]:
+      lt1 = LookupTable(settings)
+      _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
+      dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
+
+      print dna == _dna
+      print acc == _acc
+      print don == _don
 
-   #settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
-   #settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-   #settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+      #print [p for p,e in enumerate(acc) if e != -inf]
+      #print [p for p,e in enumerate(_acc) if e != -inf]
 
-   #settings['genome_file_fmt']      = 'chr%d.dna.flat'
-   #settings['splice_score_file_fmt']= 'contig_%d%s'
 
-   #allowed_fragments = [1]
-   #settings['allowed_fragments'] = allowed_fragments
+def checks():
+   settings = {}
 
    settings['genome_dir']           = 'test_data/'
    settings['acceptor_scores_loc']  = 'test_data/acc'
@@ -376,48 +400,39 @@ def simple_check():
    settings['allowed_fragments'] = allowed_fragments
 
    accessWrapper = DataAccessWrapper(settings)
-   seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
+   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+   lt1 = LookupTable(settings)
 
-   b = 0
-   e = 10000
-   seq = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=True,perform_checks=False)
+   print 'Checking with toy data...'
+   simple_check(settings,seqInfo,lt1)
 
-   total_size = seqInfo.getFragmentSize(1)
-   bp = total_size - e
-   ep = total_size - b
-   seqp = seqInfo.get_seq_and_scores(1,'+',b,e,only_seq=True,perform_checks=False)
-   seqp = reverse_complement(seqp)
+   settings = {}
 
-   print seq[0:100] == seqp[0:100]
-   print seq[534:1652] == seqp[534:1652]
-   print seq[-100:] == seqp[-100:]
+   settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+   settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
 
-   lt1 = LookupTable(settings)
-   dna,acc,don= lt1.get_seq_and_scores(1,'-',b,e)
+   settings['genome_file_fmt']      = 'chr%d.dna.flat'
+   settings['splice_score_file_fmt']= 'contig_%d%s'
 
-   print seq[:100]  == seqp[:100]  == dna[:100]
-   print seq[-100:] == seqp[-100:] == dna[-100:]
+   allowed_fragments = [1]
+   settings['allowed_fragments'] = allowed_fragments
 
-   b = 206
-   e = 300
-   dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
-   
+   accessWrapper = DataAccessWrapper(settings)
+   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
    lt1 = LookupTable(settings)
-   _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
-
-
-   print dna == _dna
-   print acc == _acc
-   print don == _don
 
-   from numpy import inf
-   print [p for p,e in enumerate(acc) if e != -inf]
-   print [p for p,e in enumerate(_acc) if e != -inf]
+   print 'Checking with real data...'
+   simple_check(settings,seqInfo,lt1)
 
 
+def run():
+   print 'Creating some artifical data...'
+   create_mini_chromosome()
+   print 'Performing some checks...'
+   checks()
 
 if __name__ == '__main__':
-   #create_mini_chromosome()
-   simple_check()
+   run()
    #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    #unittest.TextTestRunner(verbosity=2).run(suite)
index 6a3af01..b47d4a6 100644 (file)
@@ -20,7 +20,7 @@ import unittest
 
 from test_approximation import TestApproximation
 from test_qpalma import TestQPalmaPrediction
-from test_utils import TestSequenceUtils,TestLookupTable
+from test_utils import TestSequenceUtils
 
 SUCCESS = True
 
@@ -72,7 +72,7 @@ if __name__ == '__main__':
       out_fh.write(line+'\n')
 
    # after checking the modules we run some simple testcases on QPalma.
-   data_suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
+   data_suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
    approximation_suite = unittest.TestLoader().loadTestsFromTestCase(TestApproximation)
    prediction_suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
 
index 4c6483b..1db32fe 100644 (file)
 #!/usr/bin/env python 
 # -*- coding: utf-8 -*- 
 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+
+import cPickle
+import random
+import math
+import numpy
+from numpy import inf
+import os.path
 import pdb
+import array
 import unittest
-import numpy
 
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,reverse_complement
+from qpalma.qpalma_main import QPalma
+from qpalma.utils import print_prediction
+
+from qpalma.Run import Run
+
+from qpalma.SettingsParser import parseSettings
+from qpalma.OutputFormat import alignment_reconstruct
+from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
+
 from qpalma.Lookup import LookupTable
 
-class TestSequenceUtils(unittest.TestCase):
+jp = os.path.join
 
 
-   def setUp(self):
-      self.strands = ['+','']
+def createScoresForSequence(full_chromo_seq,reverse_strand=False):
+   """
+   Given a genomic sequence this function calculates random scores for all
+   ocurring splice sites.
+   """
 
-      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+   acc_pos = []
+   don_pos = []
 
-      g_fmt = 'chr%d.dna.flat'
-      s_fmt = 'contig_%d%s'
+   total_size = len(full_chromo_seq)
 
-      num_chromo = 6
+   # the first/last 205 pos do not have any predictions
+   for pos,elem in enumerate(full_chromo_seq[205:-205]):
+      if full_chromo_seq[pos-2:pos] == 'ag':
+         acc_pos.append(pos)
+      if full_chromo_seq[pos:pos+2] == 'gt' or full_chromo_seq[pos:pos+2] == 'gc':
+         don_pos.append(pos)
 
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+   # make pos 1-based
+   acc_pos  = [e+1 for e in acc_pos]
+   don_pos  = [e+1 for e in don_pos]
 
-      print self.seqInfo.chromo_sizes
+   acc_scores = [0.0]*len(acc_pos)
+   don_scores = [0.0]*len(don_pos)
 
-      #self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(1,2))
+   for idx in range(len(acc_pos)):
+      acc_scores[idx] = random.uniform(0.1,1.0)
+   
+   for idx in range(len(don_pos)):
+      don_scores[idx] = random.uniform(0.1,1.0)
 
+   # recalculate indices and reverse them in order to have positions relative
+   # to positive strand
+   if reverse_strand:
+      acc_pos = [total_size-1-e for e in acc_pos]
+      acc_pos.reverse()
+      acc_scores.reverse()
 
-   def testThalianaDataExamples(self):
-      seq = 'TGAAAACAGGAACGGATTGGAGAAAGGCGTCTCGTCAT'.lower()
-      chromo = 3
-      strand = '+'
-      pos = 19246391
+      don_pos = [total_size-1-e for e in don_pos]
+      don_pos.reverse()
+      don_scores.reverse()
 
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-      
-      seq = 'AGGCAATGAAACTGATGCATTGGACTTGACGGGTGTTG'.lower()
-      chromo = 5
-      strand = '+'
-      pos = 15394760
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'TCTTGGTGGAGGAGCTAACACCGTAGCTGACGGTTACA'.lower()
-      chromo = 4
-      strand = '+'
-      pos = 16709206
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'TTGGAAGACAGAGTCAACCATACCCTTGCCTCTGGTGA'.lower()
-      chromo = 2
-      strand = '+'
-      pos = 16579649
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
-      seq = reverse_complement(seq)
-      chromo = 1
-      strand = '-'
-      pos = 10475515
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'TTTTTCCCTTCTAGAAGACCGTAAAGGTAAACTTCTAA'.lower()
-      seq = reverse_complement(seq)
-      chromo = 3
-      strand = '-'
-      pos = 17143420
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
-      seq = reverse_complement(seq)
-      chromo = 4
-      strand = '-'
-      pos = 18083761
-      dna = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
-      self.assertEqual(seq,dna)
-
-      window = 113
-      seq = 'CACGGTGCAGATGAAGAACTGAGATCCGTTCGTGTTTG'.lower()
-      seq = reverse_complement(seq)
-      chromo = 4
-      strand = '-'
-      pos = 18083761-window
-      dna,acc,don = self.seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38+2*window,False)
-      self.assertEqual(seq,dna[window:-window])
-
-      #print dna
-      #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],acc))
-      #print "".join(map(lambda x: ['_','x'][x!=-numpy.inf],don))
-
-
-   def _testThalianaDataGeneric(self):
-
-      dna,acc,don = self.seqInfo.get_seq_and_scores(1,'+',1000,1369)
-      dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'+',1000,1369,'')
-
-      self.assertEqual(dna,dna_)
-      self.assertEqual(acc,acc_)
-      self.assertEqual(don,don_)
-
-      dna,acc,don = self.seqInfo.get_seq_and_scores(1,'-',1000,1369)
-      dna_,acc_,don_ = self.lt1.get_seq_and_scores(1,'-',1000,1369,'')
-
-      self.assertEqual(dna,dna_)
-      self.assertEqual(acc,acc_)
-      self.assertEqual(don,don_)
-
-      #dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1000,1369)
-
-      #dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1000,1369)
-      #dna_,acc_,don_ = lt1.get_seq_and_scores(1,'-',1000,1369,'')
-
-      #self.assertEqual(dna,dna_)
-      #self.assertEqual(acc,acc_)
-      #self.assertEqual(don,don_)
-
-
-      #dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1000,1369)
-      #dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1000,1369)
-
-
-   def _testLyrataData(self):
-      g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
-
-      g_fmt = 'contig%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      num_chromo = 1099
+   acc_pos = array.array('I',acc_pos)
+   acc_scores = array.array('f',acc_scores)
+
+   don_pos = array.array('I',don_pos)
+   don_scores = array.array('f',don_scores)
+
+   return acc_pos,acc_scores,don_pos,don_scores
+
+
+def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
+   """
+   """
+
+   acc_score_fn   = 'test_data/acc/chromo_1%s.Conf'%strand
+   acc_pos_fn     = 'test_data/acc/chromo_1%s.pos'%strand
+   don_score_fn   = 'test_data/don/chromo_1%s.Conf'%strand
+   don_pos_fn     = 'test_data/don/chromo_1%s.pos'%strand
+
+   acc_scores.tofile(open(acc_score_fn, 'wb'))
+   acc_pos.tofile(open(acc_pos_fn, 'wb'))
+
+   don_scores.tofile(open(don_score_fn, 'wb'))
+   don_pos.tofile(open(don_pos_fn, 'wb'))
+
+
+def create_mini_chromosome():
    
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(2,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(3,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(4,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(5,'+',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(2,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(3,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(4,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(5,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(45,'-',1,1369)
-      dna,acc,don = seqInfo.get_seq_and_scores(45,'+',1,1369)
-
-      print 'Finished'
-      #num_tests = 10
-      #for chromo in range(1,6):
-      #   for strand in ['+','-']:
-      #      for test_idx in range(num_tests):
-      #         if strand == '-':
-      #            size = seqInfo.chromo_sizes[chromo+7]
-      #         else:
-      #            size = seqInfo.chromo_sizes[chromo]
-      #         begin = random.randint(0,size)
-      #         end   = random.randint(b,size)
-      #         dna,acc,don = seqInfo.get_seq_and_scores(chromo,strand,begin,end)
-
-
-   def tearDown(self):
-      pass
-
-
-class TestLookupTable(unittest.TestCase):
+   chromo_fn   = 'test_data/chromo1.flat'
 
-   def setUp(self):
-      pass
+   chromo_fh = open(chromo_fn)
+   full_chromo_seq = chromo_fh.read()
+   full_chromo_seq = full_chromo_seq.strip()
 
+   print full_chromo_seq[:200]
 
-   def testTableThalianaData(self):
-      """
-      """
+   # create data for forward strand
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
+   print acc_pos[:5]
+   print don_pos[:5]
+   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
 
-      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-      g_fmt    = 'chr%d.dna.flat'
-      s_fmt    = 'contig_%d%s'
+   # create data for reverse strand
+   full_chromo_seq_rev = reverse_complement(full_chromo_seq)
 
-      settings = {}
-      settings['genome_dir']           = g_dir
-      settings['acceptor_scores_loc']  = acc_dir
-      settings['donor_scores_loc']     = don_dir 
+   total_size = len(full_chromo_seq_rev)
 
-      settings['genome_file_fmt']      = g_fmt
-      settings['splice_score_file_fmt']= s_fmt
+   print full_chromo_seq_rev[:200]
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
+   acc_pos = [total_size-1-e for e in acc_pos]
+   acc_pos.reverse()
+   print acc_pos[:5]
+   don_pos = [total_size-1-e for e in don_pos]
+   don_pos.reverse()
+   print don_pos[:5]
 
-      settings['allowed_fragments']    = [1]
+   #
+   # Remember: The positions are always saved one-based.
+   #
 
-      lt1 = LookupTable(settings)
+   #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
 
-      accessWrapper = DataAccessWrapper(settings)
-      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+   a = 103
+   b = 255
+   print 'pos: %s'%full_chromo_seq[a:b]
+   print 'neg: %s'%full_chromo_seq_rev[a:b]
+
+   total_size = len(full_chromo_seq)
+   ap = total_size-b
+   bp = total_size-a
+   print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
+
+class TestSequenceUtils(unittest.TestCase):
+
+   def setUp(self):
+      create_mini_chromosome()
+
+
+   def check_reverse_strand_calculation(self,id,b,e,seqInfo):
+      
+      seq,acc,don = seqInfo.get_seq_and_scores(id,'-',b,e,only_seq=False,perform_checks=False)
 
-      seq = 'CTGGCCAAAAGCTCAGGGAAGACGCAGCCTAGGGCTCC'.lower()
-      seq = reverse_complement(seq)
-      chromo = 1
-      strand = '-'
-      pos = 10475515
-      dna = seqInfo.get_seq_and_scores(chromo,strand,pos,pos+38,True)
+      total_size = seqInfo.getFragmentSize(1)
+      bp = total_size - e
+      ep = total_size - b
+      seqp,acc,don  = seqInfo.get_seq_and_scores(id,'+',bp,ep,only_seq=False,perform_checks=False)
+      seqp = reverse_complement(seqp)
 
-      self.assertEqual(seq,dna)
+      self.assertEqual(seq,seqp)
 
-      full_info = lt1.get_seq_and_scores(chromo,strand,pos,pos+38)
-      dna = full_info[0]
 
-      self.assertEqual(seq,dna)
+   def simple_check(self,settings,seqInfo,lt1):
+
+      print 'Checking sequences for some intervals...'
+      
+      intervals = [(0,10000),(545,874),(999,1234)]
+
+      for (b,e) in intervals:
+         self.check_reverse_strand_calculation(1,b,e,seqInfo)
+
+      for (b,e) in [(206,874),(545,874),(999,1234)]:
+         lt1 = LookupTable(settings)
+         _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
+         dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
+
+         self.assertEqual(dna,_dna)
+         self.assertEqual(acc,_acc)
+         self.assertEqual(don,_don)     
 
-      dna,acc,don = seqInfo.get_seq_and_scores(1,'+',1,1369)
-      dna_,acc_,don_ = lt1.get_seq_and_scores(1,'+',1,1369)
 
+   def testWithArtificalData(self):
+      settings = {}
+
+      settings['genome_dir']           = 'test_data/'
+      settings['acceptor_scores_loc']  = 'test_data/acc'
+      settings['donor_scores_loc']     = 'test_data/don'
 
+      settings['genome_file_fmt']      = 'chromo%d.flat'
+      settings['splice_score_file_fmt']= 'chromo_%d%s'
 
-   def _testTableLyrataData(self):
-      g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+      allowed_fragments = [1]
+      settings['allowed_fragments'] = allowed_fragments
 
-      g_fmt = 'contig%d.dna.flat'
-      s_fmt = 'contig_%d%s'
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+      lt1 = LookupTable(settings)
 
-      lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,range(0,1099))
+      print 'Checking with toy data...'
+      self.simple_check(settings,seqInfo,lt1)
 
 
-   def tearDown(self):
-      pass
+def checks():
+   settings = {}
 
+   settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+   settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
 
-def check_wrapper():
-   g_dir    = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
-   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
-   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+   settings['genome_file_fmt']      = 'chr%d.dna.flat'
+   settings['splice_score_file_fmt']= 'contig_%d%s'
 
-   g_fmt = 'contig%d.dna.flat'
-   s_fmt = 'contig_%d%s.Conf_cum'
+   allowed_fragments = [1]
+   settings['allowed_fragments'] = allowed_fragments
 
-   test = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+   accessWrapper = DataAccessWrapper(settings)
+   seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+   lt1 = LookupTable(settings)
 
-   for idx in range(1,100):
-      pos = test.get_genomic_fragment_fn(idx,'+')
-      neg = test.get_genomic_fragment_fn(idx,'-')
-      print pos,neg
-      assert os.path.exists(pos)
-      assert os.path.exists(neg)
+   print 'Checking with real data...'
+   simple_check(settings,seqInfo,lt1)
 
-      acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'+')
-      print acc_fn,don_fn
-      assert os.path.exists(acc_fn)
-      assert os.path.exists(don_fn)
-      acc_fn,don_fn = test.get_splice_site_scores_fn(idx,'-')
-      print acc_fn,don_fn
-      assert os.path.exists(acc_fn)
-      assert os.path.exists(don_fn)
 
+def run():
+   print 'Creating some artifical data...'
+   create_mini_chromosome()
+   print 'Performing some checks...'
+   checks()
 
 if __name__ == '__main__':
-   #suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
-   suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
+   suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceUtils)
    unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/tools/calculateAlignmentQuality.m b/tools/calculateAlignmentQuality.m
deleted file mode 100644 (file)
index bab0263..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-%
-% This script compares two alignments based on their EST overlaps
-%
-% The ground_truth and testrun
-%
-load /fml/ag-raetsch/home/fabio/tmp/zebrafish/confirmed_sequences.mat;
-testrun = genes;
-load /fml/ag-raetsch/share/projects/altsplicedata/zebrafish/confirmed_sequences.mat;
-ground_truth = genes;
-clear genes;
-
-disp('Loaded data...');
-
-fh = fopen('overlapping.pos','w+');
-
-for i = 1:length(ground_truth)
-   currentEntry = ground_truth(i);
-   currentExons = currentEntry.exons;
-   assert (length(currentEntry.transcripts) == length(currentEntry.exons));
-   numberOfEsts = length(currentEntry.transcripts);
-
-   if mod(i,100) == 0
-      fprintf('.')
-   end
-
-   for j = 1:length(testrun)
-      currentPred = testrun(j);
-
-      if ~strcmp(currentEntry.chr,currentPred.chr) || currentEntry.is_alt_spliced
-         continue;
-      end
-
-      start = currentEntry.start;
-      stop  = currentEntry.stop;
-      predStart = currentPred.start;
-      predStop = currentPred.stop;
-
-      % if both entries are not overlapping at all
-      if predStop < start || predStart > stop
-         continue;
-      end
-
-      currentPredExons = currentPred.exons;
-      numberOfPredEsts = length(currentPredExons);
-
-      % Loop over all ESTs in ground truth and testrun
-      for estIdx = 1:numberOfEsts
-         %disp(sprintf('estIdx %i ',estIdx));
-         numberOfExons = size(currentExons{estIdx},1);
-
-         for exonIdx = 1:(numberOfExons-1)
-            intronStart  = currentExons{estIdx}(exonIdx,2);
-            intronStop   = currentExons{estIdx}(exonIdx+1,1);
-
-            for predEstIdx = 1:numberOfPredEsts
-            numberOfPredExons = size(currentPredExons{predEstIdx},1);
-            currentESTName = currentPred.transcripts{predEstIdx};
-
-               for predExonIdx = 1:numberOfPredExons
-                  %%disp(sprintf('predExonIdx %i ',predExonIdx));
-                  predExonStart = currentPredExons{predEstIdx}(predExonIdx,1);
-                  predExonStop  = currentPredExons{predEstIdx}(predExonIdx,2);
-
-                  %disp('\n');
-                  %%disp(sprintf('%i %i %i %i %i %i\n',i,j,estIdx,predEstIdx,exonIdx,predExonIdx));
-               
-                  % est is covering full intron
-                  if intronStart > predExonStart && intronStop < predExonStop
-                     fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop));
-                  % est is nested inside intron
-                  elseif intronStart < predExonStart && intronStop > predExonStop
-                     fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop));
-                  % end of exonth is nested inside predExoniction
-                  elseif intronStart > predExonStart && predExonStop > intronStart && intronStop > predExonStop
-                     fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop));
-                  % predExoniction is nested inside exonth
-                  elseif intronStart < predExonStart && predExonStart < intronStop && intronStop < predExonStop
-                     fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop));
-                  else
-                     d=1;
-                  end
-               end
-            end
-         end
-      end
-   end
-end
diff --git a/tools/calculateSizes b/tools/calculateSizes
deleted file mode 100644 (file)
index d13139d..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-for line in open(sys.argv[1]):
-   line = line.strip()
-   slist = line.split(' ')
-   size = int(slist[-1]) - int(slist[-2])
-   print '%d %s' % (size,line)
diff --git a/tools/data_tools/Makefile b/tools/data_tools/Makefile
deleted file mode 100644 (file)
index a956933..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-CC=gcc
-CFLAGS=-ggdb -Wall -Wshadow -lm
-
-SRCS= parser.c\
-       sort_genes.c\
-       tools.c\
-       join.c\
-       debug_tools.c\
-       datastructures.c
-
-OBJS = $(SRCS:%.cpp=%.o)
-
-all: $(OBJS)
-       @ $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
-
-test:
-       @ ./filterReads subset.gff subset.reads output 1
-
-join: $(OBJS)
-       @ $(CC) $(CFLAGS) -o test_join $(OBJS) main_join.c
-
-clean:
-       @ rm -rf *.o
diff --git a/tools/data_tools/datastructures.c b/tools/data_tools/datastructures.c
deleted file mode 100644 (file)
index 16ad6a6..0000000
+++ /dev/null
@@ -1,95 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-
-#include "datastructures.h"
-
-Read* read_alloc(int _size) {
-   Read* newRead = (Read*) malloc(sizeof(struct read));
-   newRead->chr         = 0;
-   newRead->pos         = 0;
-   newRead->seq         = 0;
-   newRead->id          = 0;
-   newRead->strand      = ' ';
-   newRead->mismatch    = 0;
-   newRead->occurrence  = 0;
-   newRead->size        = _size;
-   newRead->cut         = 0;
-   newRead->prb         = 0; 
-   newRead->cal_prb     = 0; 
-   newRead->chastity    = 0; 
-
-   return newRead;
-}
-
-void free_read(Read* oldRead) {
-   if(oldRead->seq != 0)
-      free(oldRead->seq);
-   if(oldRead->prb != 0)
-      free(oldRead->prb);
-   if(oldRead->cal_prb != 0)
-      free(oldRead->cal_prb);
-   if(oldRead->chastity != 0)
-      free(oldRead->chastity);
-}
-
-Read* create_read(int chr, int pos, char* seq, unsigned long id, char strand, int mismatch,
-int occurrence, int size, int cut, char* prb, char* cal_prb, char* chastity) {
-
-   Read* newRead = read_alloc(size);
-
-   newRead->chr         = chr;
-   newRead->pos         = pos;
-   int _size = strlen(seq);
-   newRead->seq         = malloc(sizeof(char)*(_size+1));
-   newRead->seq         = strncpy(newRead->seq,seq,_size);
-   newRead->seq[_size] = '\0';
-   newRead->id          = id;
-   newRead->strand      = strand;
-   newRead->mismatch    = mismatch;
-   newRead->occurrence  = occurrence;
-   newRead->size        = size;
-   newRead->cut         = cut;
-   newRead->prb         = malloc(sizeof(char)*(size+1));
-   newRead->prb         = strncpy(newRead->prb,prb,size);
-   newRead->prb[size] = '\0';
-   newRead->cal_prb     = malloc(sizeof(char)*(size+1));
-   newRead->cal_prb     = strncpy(newRead->cal_prb,cal_prb,size);
-   newRead->cal_prb[size] = '\0';
-   newRead->chastity    = malloc(sizeof(char)*(size+1));
-   newRead->chastity    = strncpy(newRead->chastity,chastity,size);
-   newRead->chastity[size] = '\0';
-
-   return newRead;
-}
-
-struct gene* gene_alloc(void) {
-   struct gene* newGene = (struct gene*) malloc(sizeof(struct gene));
-   newGene->exon_starts = malloc(2*sizeof(int));
-   newGene->exon_stops = malloc(2*sizeof(int));
-   newGene->num_exons = 0;
-   newGene->max_exons = 2;
-   newGene->id = 0;
-   return newGene;
-}
-
-void free_gene(struct gene* oldGene) {
-   free(oldGene->exon_starts);
-   free(oldGene->exon_stops);
-   if( oldGene->id != 0) {
-      free(oldGene->id);
-   }
-}
-
-void add_exon(struct gene* currentGene,int start, int stop) {
-   int idx = currentGene->num_exons;
-
-   if ( idx >= currentGene->max_exons) {
-      currentGene->exon_starts = realloc(currentGene->exon_starts,sizeof(int)*2*currentGene->max_exons);
-      currentGene->exon_stops = realloc(currentGene->exon_stops,sizeof(int)*2*currentGene->max_exons);
-      currentGene->max_exons *= 2;
-   }
-
-   currentGene->exon_starts[idx] = start;
-   currentGene->exon_stops[idx] = stop;
-   currentGene->num_exons++;
-}
diff --git a/tools/data_tools/datastructures.h b/tools/data_tools/datastructures.h
deleted file mode 100644 (file)
index c166136..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __DATA_STRUCTURES_H__
-#define __DATA_STRUCTURES_H__
-
-typedef struct read {
-   int chr;
-   int pos;
-   char* seq;
-   unsigned long id;
-   char strand;
-   int mismatch;
-   int occurrence;
-   int size;
-   int cut;
-   char* prb;
-   char* cal_prb;
-   char* chastity;
-} Read;
-
-Read* read_alloc(int _size);
-void free_read(Read* oldRead);
-
-Read* create_read(int chr, int pos, char* seq, unsigned long id, char strand, int mismatch,
-int occurrence, int size, int cut, char* prb, char* cal_prb, char* chastity);
-
-struct gene {
-   int start;
-   int stop;
-   int* exon_starts;
-   int* exon_stops;
-   int num_exons;
-   int max_exons;
-   char strand;
-   char* id;
-};
-
-struct gene* gene_alloc(void);
-void free_gene(struct gene* oldGene);
-void add_exon(struct gene* currentGene,int start, int stop);
-
-#endif // __DATA_STRUCTURES_H__
diff --git a/tools/data_tools/debug_tools.c b/tools/data_tools/debug_tools.c
deleted file mode 100644 (file)
index ed97af8..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "debug_tools.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-void fassert(int expr,int line, char* file) {
-   if (! expr) {
-      printf("invalid fassert at line %d in file %s!\n",line,file);
-      exit(EXIT_FAILURE);
-   }
-}
diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c
deleted file mode 100644 (file)
index 139c681..0000000
+++ /dev/null
@@ -1,774 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// The purpose of this program is to read a gff and a    
-// solexa reads file and create a data set used by QPalma. 
-//
-// Notes:
-//
-// - Both the read indices and the gff gene indices are one-based
-//
-////////////////////////////////////////////////////////////////////////////////
-
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <unistd.h>
-#include <math.h>
-
-#include "debug_tools.h"
-#include "join.h"
-#include "datastructures.h"
-
-#define _FILE_OFFSET_BITS == 64
-
-const char* line_format = "%d\t%d\t%s\t%lu\t%c\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n";
-
-const int read_size = 36;
-
-const int min_overlap = 1;
-const int max_overlap = 35;
-
-unsigned long read_nr = 1;
-unsigned long unfiltered_read_nr = 1;
-
-int parse_gff(char* filename,FILE* fid,struct gene*** allGenes);
-void sort_genes(struct gene*** allGenes, int numGenes);
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs);
-void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx);
-int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx);
-int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size);
-Tuple join_seq(char* new_seq, char* up_seq,int u_off, int u_size, char* down_seq, int d_off, int d_size, int d_range);
-
-static char *info = "Usage is:\n./filterReads gff reads output";
-
-void reverse_complement(char** seq, int len);
-
-char current_strand;
-
-/*
- * Some constants specifying the exact behavior of the filter
- *
- */
-
-//#define _FDEBUG_ 0
-//#define DEBUG_READ 38603
-
-/*
- * TODO:
- * - Check strand -> done simple (only if equal)
- * - check for [AC] and similar entries -> done simple (see function
- */
-
-#ifdef _FDEBUG_
-   if(read_nr == DEBUG_READ) {
-      printf("read nr %lu",read_nr);
-      printf("pos: %d %d %d %d\n",p_start,exon_stop,exon_start,p_stop);
-      printf("u/d range: %d %d\n",up_range,down_range);
-      printf("u/d size: %d %d\n",u_size,d_size);
-      printf("u/d off: %d %d\n",u_off,d_off);
-      printf("add_pos/down_range-d_size: %d %d\n",additional_pos,(down_range - d_size));
-
-      printf("******************\n");
-
-      printf("%s\n",up_read->seq);
-      //printf("%s\n",new_up_seq);
-
-      printf("******************\n");
-
-      printf("%s\n",down_read->seq);
-      //printf("%s\n",new_down_seq);
-
-      printf("******************\n");
-      printf("%s\n",new_seq);
-      printf("%s\n",new_prb);
-      printf("%s\n",new_cal_prb);
-      printf("%s\n",new_chastity);
-   }
-#endif // _FDEBUG_
-
-int combined_reads = 0;
-
-/*
- *
- *
- */
-
-void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* out_fs, FILE* unfiltered_out_fs) {
-   int status;
-
-   int buffer_size= 64;
-   int chr        = 0;
-   int pos        = 0;
-   char* seq      = malloc(sizeof(char)*buffer_size);
-   unsigned long id         = 0;
-   char strand    = ' ';
-   int mismatch   = 0;
-   int occurrence = 0;
-   int size       = 0;
-   int cut        = 0;
-   char* prb      = malloc(sizeof(char)*buffer_size);
-   char* cal_prb  = malloc(sizeof(char)*buffer_size);
-   char* chastity = malloc(sizeof(char)*buffer_size);
-
-   int reads_fid = fileno(reads_fs);
-   struct stat reads_stat;
-   if ( fstat(reads_fid,&reads_stat) == -1) {
-      perror("fstat");
-      exit(EXIT_FAILURE);
-   }
-   off_t reads_filesize = reads_stat.st_size;
-   printf("Reads file is of size %lu bytes\n",(unsigned long) reads_filesize);
-   //int numReads = reads_filesize / 178.0;
-
-   void *reads_area = mmap (NULL,reads_filesize,PROT_READ,MAP_PRIVATE,reads_fid,0);
-   if (reads_area == MAP_FAILED) {
-      perror("mmap");
-      exit(EXIT_FAILURE);
-   }
-   close(reads_fid);
-   printf("Successfully mapped %lu bytes of reads file into memory\n",(unsigned long)reads_filesize);
-
-   char* lineBeginPtr = (char*) reads_area;
-   char* lineEndPtr = (char*) reads_area;
-   char* end_of_mapped_area = ((char*) reads_area) + reads_filesize;
-
-   while (*lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
-   lineEndPtr++;
-
-   char* current_line = malloc(sizeof(char)*512);
-   memset(current_line,0,512);
-
-   int SIZE = 5000;
-   // initialize boundary arrays
-   Read** upstream_overlap  = malloc(sizeof(Read*)*SIZE);
-   Read** downstream_overlap= malloc(sizeof(Read*)*SIZE);
-   int uov,dov;
-   uov = dov = 0;
-
-   int skippedLinesCounter = 0;
-
-   int prev_exon_start = -1;
-   int prev_exon_stop = -1;
-   int cur_exon_start = -1;
-
-   unsigned long line_size = lineEndPtr - lineBeginPtr;
-   //printf("distance is %lu\n",line_size);
-   strncpy(current_line,lineBeginPtr,line_size);
-   current_line[line_size] = '\0';
-   //printf("%s test",current_line);
-
-   int gene_idx = 0;
-   int exon_idx = 1;
-   struct gene* currentGene = (*allGenes)[gene_idx];
-   char* gene_id = currentGene->id;
-
-   int skippedReadCtr = 0;
-   int uselessReadCtr = 0;
-   int exonicReadCtr = 0;
-
-   int currentOverlapCtr = 0;
-   int previousOverlapCtr = 0;
-
-   int multioccurReadCtr = 0;
-
-   Read* currentRead;
-   int up_idx, down_idx;
-
-   int readCtr = 0;
-   int wrong_strand_ctr = 0;
-   int read_within_gene_ctr = 0;
-   int read_outside_gene_ctr = 0;
-   int read_start, read_stop;
-   // start of the parsing loop 
-  
-      while(1) {
-         if (gene_idx == numGenes || strcmp(current_line,"") == 0)
-            break;
-
-         gene_id = currentGene->id;
-
-         //if (readCtr != 0 && readCtr % 1000000 == 0) {
-         //   printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
-         //   printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
-         //   printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
-         //   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
-         //   printf("\t%d useless reads\n",uselessReadCtr);
-         //   printf("\t%d skipped reads\n",skippedReadCtr);
-         //   printf("\t%d multioccurring\n",multioccurReadCtr);
-         //   printf("\t%d wrong_strand\n",wrong_strand_ctr);
-         //   printf("%d within gene\n",read_within_gene_ctr);
-         //   printf("%d outside gene\n",read_outside_gene_ctr);
-         //   printf("%d reads were totally exonic\n",exonicReadCtr);
-         //   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
-         //   printf("%d reads where newly combined from two original reads\n",combined_reads);
-         //   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
-         //}
-
-         // pos of the reads is one-based
-         status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
-         if (status < 12) {
-            skippedLinesCounter++;
-            goto next_read;
-         }
-
-         // if the read is occurring several times elsewhere then get rid of it 
-         if ( occurrence != 1 ) {
-            multioccurReadCtr++;
-            goto next_read;
-         }
-
-         //printf("before rc %s\n",seq);
-         if (current_strand == 'P')
-            reverse_complement(&seq,strlen(seq));
-         //printf("after rc %s\n",seq);
-
-         //printf("new read: %s at %d\n",seq,pos);
-   
-         // define read start and stop positions
-         read_start = pos;
-         read_stop  = pos + read_size-1;
-
-         FA(strlen(seq) >= read_size);
-
-         FA(currentGene != 0);
-
-         if ( currentGene->start <= read_start && read_stop <= currentGene->stop) { // read is within gene borders
-            read_within_gene_ctr++;
-
-            exon_label:
-
-            if (exon_idx == currentGene->num_exons) {
-               gene_idx++;
-               exon_idx = 1;
-               currentGene = (*allGenes)[gene_idx];
-               while( currentGene == 0 && gene_idx < numGenes) {
-                  currentGene = (*allGenes)[gene_idx];
-                  gene_idx++;
-               }
-               continue;
-            }
-
-            prev_exon_start = currentGene->exon_starts[exon_idx-1];
-            prev_exon_stop = currentGene->exon_stops[exon_idx-1];
-            cur_exon_start = currentGene->exon_starts[exon_idx];
-
-            //printf("id: %s,exon_idx: %d intron: %d %d read start/stop: %d / %d\n",currentGene->id,exon_idx,prev_exon_stop,cur_exon_start,read_start,read_stop);
-
-            if ( (cur_exon_start - prev_exon_stop - 1) < min_overlap || cur_exon_start < read_start ) { // go to next exon
-
-               if (uov != 0 && dov != 0)
-                  combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
-
-               for(up_idx=0;up_idx<uov;up_idx++) {
-                  free_read(upstream_overlap[up_idx]);
-                  free(upstream_overlap[up_idx]);
-               }
-
-               for(down_idx=0;down_idx<dov;down_idx++) {
-                  free_read(downstream_overlap[down_idx]);
-                  free(downstream_overlap[down_idx]);
-               }
-
-               uov = dov = 0;
-
-               exon_idx++;
-               goto exon_label;
-            }
-
-            if ( prev_exon_start <= read_start && read_stop <= prev_exon_stop ) { // read is inside previous exon
-
-               // output of unused i.e. unspliced reads
-               fprintf(unfiltered_out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
-               unfiltered_read_nr,chr,strand,seq,0,read_size,prb,cal_prb,chastity,gene_id,read_start,-1,-1,read_stop,-1);
-               unfiltered_read_nr++;
-               
-               exonicReadCtr++;
-               goto next_read;
-            }
-
-            if ( read_stop < prev_exon_stop ) { // go to next read 
-               //printf("%d\t%d\t%d\n",read_start,prev_exon_start,prev_exon_stop);
-               //if( exon_idx > 1) {
-               //   printf("%d\t%d\n", currentGene->exon_starts[exon_idx-2], currentGene->exon_stops[exon_idx-2]);
-               //   printf("---\n");
-               //}
-
-               uselessReadCtr++;
-               goto next_read;
-            }
-
-            // if this position is reached the read is somehow overlapping or
-            // exactly on the exon boundary.
-            if ( (prev_exon_stop - read_start + 1) >= min_overlap && (prev_exon_stop - read_start + 1) <= max_overlap ) { // read overlaps with previous exon boundary
-               //printf("%s\n",current_line);
-               previousOverlapCtr++;
-               currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
-               assert (uov < SIZE);
-               upstream_overlap[uov] = currentRead;
-               uov++;
-               goto next_read;
-            }
-         
-            if ( ( read_stop - cur_exon_start + 1) >= min_overlap && (read_stop - cur_exon_start + 1) <= max_overlap ) { // read overlaps with current exon boundary
-               //printf("%s\n",current_line);
-               currentOverlapCtr++;
-               currentRead = create_read(chr,read_start,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity);
-               assert (dov < SIZE);
-               downstream_overlap[dov] = currentRead;
-               dov++;
-               goto next_read;
-            }
-
-            uselessReadCtr++;
-            goto next_read; // read was not useful i.e. not overlapping/starting at exon boundaries
-
-         } else { // read is not within gene borders
-            read_outside_gene_ctr++;
-
-            if (uov != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
-
-            for(up_idx=0;up_idx<uov;up_idx++) {
-               free_read(upstream_overlap[up_idx]);
-               free(upstream_overlap[up_idx]);
-            }
-
-            for(down_idx=0;down_idx<dov;down_idx++) {
-               free_read(downstream_overlap[down_idx]);
-               free(downstream_overlap[down_idx]);
-            }
-
-            uov = dov = 0;
-
-            if ( currentGene->stop < read_stop ) { // go to next gene
-               gene_idx++;
-               exon_idx = 1;
-               currentGene = (*allGenes)[gene_idx];
-               while( currentGene == 0 && gene_idx < numGenes) {
-                  currentGene = (*allGenes)[gene_idx];
-                  gene_idx++;
-               }
-               //printf("currentGene->start / currentGene->stop %d/%d pos is %d\n",currentGene->start,currentGene->stop,pos);
-               continue;
-            }
-
-            if ( read_start < currentGene->start ) { // go to next read
-               skippedReadCtr++;
-               
-               next_read:
-               
-               lineBeginPtr = lineEndPtr;
-               while (*(char*)lineEndPtr != '\n' && lineEndPtr != end_of_mapped_area) lineEndPtr++;
-               lineEndPtr++;
-               readCtr += 1;
-               current_line = strncpy(current_line,lineBeginPtr,lineEndPtr-lineBeginPtr);
-               current_line[lineEndPtr-lineBeginPtr] = '\0';
-               continue;
-            }
-         }
-      }
-
-   //} // end of for all strands
-
-   if (uov != 0 && dov != 0)
-      combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uov,downstream_overlap,dov,out_fs,gene_id,exon_idx);
-
-   for(up_idx=0;up_idx<uov;up_idx++) {
-      free_read(upstream_overlap[up_idx]);
-      free(upstream_overlap[up_idx]);
-   }
-
-   for(down_idx=0;down_idx<dov;down_idx++) {
-      free_read(downstream_overlap[down_idx]);
-      free(downstream_overlap[down_idx]);
-   }
-
-   uov = dov = 0;
-
-   free(upstream_overlap);
-   free(downstream_overlap);
-
-   printf("Processed %d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
-   printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
-   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
-   printf("\t%d useless reads\n",uselessReadCtr);
-   printf("\t%d skipped reads\n",skippedReadCtr);
-   printf("\t%d multioccurring\n",multioccurReadCtr);
-   printf("\t%d wrong_strand\n",wrong_strand_ctr);
-   printf("%d reads were totally exonic\n",exonicReadCtr);
-   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
-   printf("%d reads where newly combined from two original reads\n",combined_reads);
-   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
-
-   status = munmap(reads_area,reads_filesize);
-   if(status != 0)
-      perror("munmap");
-
-   free(current_line);
-   free(seq);
-   free(prb);
-   free(cal_prb);
-   free(chastity);
-}
-
-
-void combine_info(int exon_stop, int exon_start, Read** upstream, int up_size, Read** downstream, int down_size, FILE* out_fs,const char* gene_id, int exon_idx) {
-   int up_idx, down_idx, success;
-
-   char* up_used_flag = calloc(up_size,sizeof(char));
-   char* down_used_flag = calloc(down_size,sizeof(char));
-
-   Read* currentUpRead;
-   Read* currentDownRead;
-
-   for(up_idx=0;up_idx<up_size;up_idx++) {
-      if( up_used_flag[up_idx] == 1)
-         continue;
-
-      currentUpRead = upstream[up_idx];
-
-      for(down_idx=0;down_idx<down_size;down_idx++) {
-
-         if( up_used_flag[up_idx] == 1 || down_used_flag[down_idx] == 1)
-            continue;
-
-         currentDownRead = downstream[down_idx];
-
-         if(currentUpRead->strand != currentDownRead->strand)
-            continue;
-
-         success = join_reads(exon_stop,exon_start,currentUpRead,currentDownRead,out_fs,gene_id,exon_idx);
-         
-         if(success == 1) {
-            up_used_flag[up_idx] = 1;
-            down_used_flag[down_idx] = 1;
-         }
-      }
-   }
-
-   exit(0);
-   free(up_used_flag);
-   free(down_used_flag);
-}
-
-/*
- * Now we join the candidate reads wherever possible according to the following
- * scheme:
- *
- * ACGTACGTCA GTXXXXXXXXAG ACGTAGACGT
- * p1       e1             e2       p2  
- *
- * 
- *
- *
- */
-
-int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FILE* out_fs,const char* gene_id, int exon_idx) {
-   // range of possible sequence length on exon side
-   int up_read_start = up_read->pos;
-   //int up_read_stop  = up_read->pos+read_size-1;
-
-   int down_read_start = down_read->pos;
-   int down_read_stop  = down_read->pos+read_size-1;
-
-   int up_range   = exon_stop - up_read_start + 1;
-   int down_range = down_read_stop - exon_start + 1;
-   int retval;
-
-   int u_size, d_size;
-   u_size = d_size = -1;
-
-   if(up_range+down_range < read_size)
-      return 0;
-
-   if (read_nr % 2 != 0) {
-      d_size = down_range;
-      u_size = read_size - d_size;
-   } else {
-      u_size = up_range;
-      d_size = read_size - u_size;
-   }
-
-   if( u_size > up_range || d_size > down_range)
-      return 0;
-
-   int p_start  = exon_stop  - u_size + 1;
-   int p_stop   = exon_start + d_size - 1;
-
-   int u_off = p_start - up_read_start;
-   int d_off = exon_start - down_read_start;
-
-   FA( u_off >= 0 && d_off >= 0 );
-   FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
-   FA( u_size + d_size == read_size );
-
-   // seems reasonable up to here
-
-   int buf_size = 4*read_size;
-   char* new_seq = malloc(sizeof(char)*buf_size);
-   memset(new_seq,'z',sizeof(char)*buf_size);
-
-   if ( current_strand == 'P' ) {
-      printf("flipping read sequences...\n");
-      printf("%s %s\n",up_read->seq,down_read->seq);
-      char *tmp = malloc(sizeof(char)*(strlen(up_read->seq)+1));
-      strncpy(tmp,up_read->seq,strlen(up_read->seq));
-      tmp[strlen(up_read->seq)]='\0';
-
-      realloc(up_read->seq,sizeof(char)*(strlen(down_read->seq)+1));
-      strncpy(up_read->seq,down_read->seq,strlen(down_read->seq));
-      up_read->seq[strlen(down_read->seq)] = '\0';
-
-      if (up_read->seq == NULL)
-         perror("realloc\n");
-
-      realloc(down_read->seq,sizeof(char)*strlen(tmp)+1);
-      strncpy(down_read->seq,tmp,strlen(tmp));
-      down_read->seq[strlen(tmp)] = '\0';
-      
-      if (down_read->seq == NULL)
-         perror("realloc\n");
-      
-      free(tmp);
-      printf("flipping done...\n");
-      printf("%s %s\n",up_read->seq,down_read->seq);
-   }
-   
-   printf("start joining...\n");
-   Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
-   printf("end of joining...\n");
-
-   printf("new_seq is %s\n",new_seq);
-
-
-   int cut_pos = jinfo.first;
-   int additional_pos = jinfo.second;
-   printf("jinfo contains %d/%d\n",jinfo.first,jinfo.second);
-
-   buf_size = read_size+1+additional_pos;
-
-   printf("allocating quality arrays (size=%d)...\n",buf_size);
-   char* new_prb        = malloc(sizeof(char)*buf_size);
-   char* new_cal_prb    = malloc(sizeof(char)*buf_size);
-   char* new_chastity   = malloc(sizeof(char)*buf_size);
-
-   if (new_prb == NULL || new_cal_prb == NULL || new_chastity == NULL)
-      perror("malloc\n");
-   
-   if( jinfo.first == -1 || (additional_pos > (down_range - d_size)) ) {
-      retval = 0;
-      goto free;
-   }
-
-   printf("joining qualities...\n");
-   strncpy(new_prb, up_read->prb+u_off, u_size);
-   strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
-   new_prb[buf_size-1] = '\0';
-
-   strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
-   strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
-   new_cal_prb[buf_size-1] = '\0';
-
-   strncpy(new_chastity, up_read->chastity+u_off, u_size);
-   strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
-   new_chastity[buf_size-1] = '\0';
-   printf("end of joining qualities...\n");
-
-   //printf("old reads: %s %s (%d %d %d/%d)\n",up_read->seq,down_read->seq,up_read->pos,down_read->pos,u_off,d_off);
-   //printf("new read: %s %d %d\n",new_seq,cut_pos,u_size);
-   //if ( current_strand == 'P' ) {
-   //   int alpha   = read_size - u_off - 1;
-   //   int beta    = alpha - u_size ;
-   //   p_start     = up_read->pos + beta + 1;
-   //   exon_stop   = up_read->pos + alpha;
-   //   alpha   = read_size - d_off - 1;
-   //   beta    = alpha - (read_size - u_size);
-   //   exon_start  = down_read->pos + beta + 1; 
-   //   p_stop      = down_read->pos + alpha;
-   //}
-
-   int status = 1; //fitting(up_read->prb,down_read->prb,u_off,d_off,u_size,d_size);
-
-   if(status != 1) {
-      retval = 0;
-      goto free;
-   }
-
-   retval = status;
-
-   fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
-      read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
-
-   read_nr++;
-   combined_reads++;
-
-   free:
-   free(new_seq);
-   free(new_prb);
-   free(new_cal_prb);
-   free(new_chastity);
-
-   return retval;
-}
-
-static char* translation = "T G   C            A      [ ]";
-
-void reverse_complement(char** seq, int len) {
-   int idx;
-   char *temp = malloc(sizeof(char)*len);
-   for(idx=0;idx<len;idx++)
-      temp[idx] = translation[(*seq)[idx]-65];
-
-   idx=0;
-   int temp_idx=len-1;
-   while(1) {
-      if (temp[temp_idx] == ']') {
-         (*seq)[idx] = '[';
-         (*seq)[idx+1] = temp[temp_idx-2];
-         (*seq)[idx+2] = temp[temp_idx-1];
-         (*seq)[idx+3] = ']';
-         idx      += 4;
-         temp_idx -= 4;
-      } else {
-         (*seq)[idx] = temp[temp_idx];
-         idx++;
-         temp_idx--;
-      }
-
-      if (idx == len || temp_idx == -1)
-         break;
-   }
-   free(temp);
-}
-
-
-void print_read(Read* cRead) {
-   printf(line_format,
-   cRead->chr, cRead->pos, cRead->seq, cRead->id,
-   cRead->strand, cRead->mismatch, cRead->occurrence,
-   cRead->size, cRead->cut, cRead->prb, cRead->cal_prb,
-   cRead->chastity);
-}
-
-
-void open_file(const char* filename, const char* mode, FILE** fs) {
-   *fs = fopen(filename,mode);
-   if(*fs == NULL) {
-      printf("Error: Could not open file: %s",filename);
-      exit(EXIT_FAILURE);
-   }
-}
-
-/*
- * The program expects 7 arguments, namely:
- *
- * - The strand to be used
- * - The filename of the gff file
- * - The filename of the reads file
- * - The name of the spliced reads ouput file
- * - The name of the unspliced reads ouput file
- * - The offset for counting new spliced reads
- * - The offset for counting new unspliced reads
- *
- */
-
-int main(int argc, char* argv[]) {
-
-   if(argc != 8) {
-      printf("%s\n",info);
-      exit(EXIT_FAILURE);
-   }
-
-   current_strand = argv[1][0];
-   printf("Current strand is %c\n",current_strand);
-
-   int status;
-   int filenameSize = 256;
-   char* gff_filename = malloc(sizeof(char)*filenameSize);
-
-   strncpy(gff_filename,argv[2],filenameSize);
-
-   FILE *gff_fs;
-   FILE *reads_fs;
-   FILE *out_fs;
-   FILE *unfiltered_out_fs;
-
-   // open file streams for all needed input/output files
-   open_file(argv[2],"r",&gff_fs);
-   open_file(argv[3],"r",&reads_fs);
-   open_file(argv[4],"w",&out_fs);
-   open_file(argv[5],"w",&unfiltered_out_fs);
-
-   read_nr = strtoul(argv[6],NULL,10);
-   read_nr++;
-
-   unfiltered_read_nr = strtoul(argv[7],NULL,10);
-   unfiltered_read_nr++;
-
-   // allocate and load all genes and then close the gff file stream
-   struct gene** allGenes;
-   int numGenes = parse_gff(gff_filename,gff_fs,&allGenes);
-   status = fclose(gff_fs);
-   free(gff_filename);
-   if(status != 0)
-      printf("closing of gff filestream failed!\n");
-
-   printf("Successfully parsed gff file! Found %d genes.\n",numGenes);
-
-   // check if allGenes is sorted. if not throw away those genes that do not
-   // occur in the sorted order
-   int g_idx;
-   struct gene* currentGene = 0;
-   int nulled_genes=0;
-   int old_gene_stop = -1;
-   for(g_idx=0;g_idx<numGenes;g_idx++) {
-      currentGene = allGenes[g_idx];
-
-      if (! (currentGene->start < currentGene->stop))
-         printf("Invalid positions for gene %s!\n",currentGene->id);
-
-      if (! (old_gene_stop <  currentGene->start ) ) {
-         printf("Gene %s is overlapping\n",currentGene->id);
-         old_gene_stop = currentGene->stop;
-         free_gene(allGenes[g_idx]);
-         free(allGenes[g_idx]);
-         allGenes[g_idx] = 0;
-         nulled_genes++;
-         continue;
-      }
-      old_gene_stop = currentGene->stop;
-   }
-
-   printf("Found %d unordered genes.\n",nulled_genes);
-   int gidx, eidx;
-   int exon_cov = 0;
-   for(gidx=0;gidx<numGenes;gidx++) {
-      if (allGenes[gidx] == 0)
-         continue;
-
-      for(eidx=0;eidx<allGenes[gidx]->num_exons;eidx++) {
-         exon_cov += allGenes[gidx]->exon_stops[eidx] - allGenes[gidx]->exon_starts[eidx];
-   }}
-   printf("Exon coverage is %f\n",(double)exon_cov/30432563);
-
-   // now that we loaded all neccessary data we start to process the reads
-   process_reads(reads_fs,&allGenes,numGenes,out_fs,unfiltered_out_fs);
-
-   // free all allocated ressources
-   for(g_idx=0;g_idx<numGenes;g_idx++) {
-      if(allGenes[g_idx] != 0) {
-         free_gene(allGenes[g_idx]);
-         free(allGenes[g_idx]);
-      }
-   }
-   free(allGenes);
-
-   status = fclose(reads_fs);
-   status = fclose(out_fs);
-   if(status != 0)
-      perror("fclose");
-      
-   return 0;
-}
diff --git a/tools/data_tools/filterReads.py b/tools/data_tools/filterReads.py
deleted file mode 100644 (file)
index 93fb93d..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-import sys
-import pdb
-
-import qpalma
-from qpalma.tools.parseGff import parse_gff
-
-class Read:
-
-   def __init__(self):
-      pass
-
-def createRead(line):
-   chr,pos,seq,id,strand,mismatch,occurrence,size,cut,prb,cal_prb,chastity = line.split()
-   newRead = Read()
-   newRead.chr = int(chr)
-   newRead.pos = int(pos)
-   newRead.seq = seq
-
-   if strand == 'D':
-      newRead.strand = '+'
-   if strand == 'P':
-      newRead.strand = '-'
-
-   newRead.mismatch = int(mismatch)
-   newRead.occurrence = int(occurrence)
-   newRead.size = int(size)
-
-   newRead.prb = prb
-   newRead.cal_prb = cal_prb
-   newRead.chastity = chastity
-
-   return newRead
-
-
-def filter(gff_file,reads,out):
-   read_size = 36
-
-   d,allGenes = parse_gff(gff_file)
-   del d
-
-   print 'Found %d genes.' % len(allGenes)
-
-   old_gene_pos = -1
-   for g_idx in range(len(allGenes)):
-      currentGene = allGenes[g_idx]
-
-      if not old_gene_pos < currentGene.start:
-         allGenes[g_idx] = 'nil'
-
-      old_gene_pos = currentGene.stop
-   
-   allGenes = [e for e in allGenes if e != 'nil']
-
-   print '%d genes remained.' % len(allGenes)
-
-   read_fh = open(reads)
-
-   for currentGene in allGenes:
-      currentGene.overlappingReads = []
-      currentRead = createRead(read_fh.next())
-
-      while currentGene.start < currentRead.pos and currentRead.pos+read_size-1 < currentGene.stop:
-         currentGene.overlappingReads.append(currentRead)
-         currentRead = createRead(read_fh.next())
-
-   #for currentGene in allGenes:
-   #   for (exon_start,exons_stop) in currentGene.exons:
-
-
-   out_fh = open(out,'w+')
-
-
-   out_fh.close()
-
-if __name__ == '__main__':
-   assert len(sys.argv) == 4, 'Usage: '
-   (gff_file,reads,out) = sys.argv[1:4]
-   filter(gff_file,reads,out)
-   
diff --git a/tools/data_tools/parser.c b/tools/data_tools/parser.c
deleted file mode 100644 (file)
index c314e05..0000000
+++ /dev/null
@@ -1,166 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <regex.h>
-#include <assert.h>
-
-#include "datastructures.h"
-
-/* 
- * Note: 
- * In the current gff files from the TAIR repository
- * the exons boundaries are defined as follows: 
- *
- * exon starts at:
- *
- * agxy
- *    ^
- * whereas exon stops at:
- *
- * xygt
- *   ^
- *
- *
- */
-
-char* get_regerror (int errcode, regex_t *compiled) {
-     size_t length = regerror (errcode, compiled, NULL, 0);
-     char *buffer = malloc (length);
-     (void) regerror (errcode, compiled, buffer, length);
-     return buffer;
-}
-
-char* get_id(regex_t rx, const char* desc) {
-   size_t nmatch = 1;
-   regmatch_t* all_matches = malloc(sizeof(regmatch_t)*nmatch);
-   int regerr = regexec(&rx, desc, nmatch, all_matches, REG_NOTBOL);
-
-   if ( regerr != 0 ) {
-      char* mes = get_regerror(regerr,&rx);
-      perror(mes);
-      exit(EXIT_FAILURE);
-   }
-
-   //printf("%s\n",desc);
-
-   int start = all_matches[0].rm_so+3;
-   int end = all_matches[0].rm_eo-1;
-   assert( start <= end);
-
-   int id_size = end - start;
-   char* id = malloc(sizeof(char)*(id_size+1));
-   
-   strncpy(id,desc+start,id_size);
-   id[id_size] = '\0';
-   //printf("desc is %s, id is %s\n",desc,id);
-  
-   free(all_matches);
-
-   return id;
-}
-  
-int parse_gff(char *filename, FILE* fid,struct gene*** allGenes) {
-
-   int buffer_size = 256;
-   char* chr   = malloc(sizeof(char)*buffer_size);
-   char* blah  = malloc(sizeof(char)*buffer_size);
-   char* id    = malloc(sizeof(char)*buffer_size);
-   char* desc    = malloc(sizeof(char)*buffer_size);
-   int start  = 0;
-   int stop   = 0;
-   char* xy    = malloc(sizeof(char)*4);
-   char* strand = malloc(sizeof(char)*4);
-   char* xz    = malloc(sizeof(char)*4);
-
-   regex_t rx;
-   const char* pattern = "ID=[^;]*;";
-   if ( regcomp(&rx, pattern, 0) != 0) {
-      perror("regcomp");
-      exit(EXIT_FAILURE);
-   }
-
-   // do one pass through the gff line to determine the number of
-   // genes
-   int numGenes = 0;
-   int status = 0;
-   while(1) {
-      status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
-      if(status == EOF)
-         break;
-
-      if ( status >= 5 && strcmp(id,"gene")==0)
-         numGenes++;
-
-   }
-   freopen(filename,"r",fid);
-
-   int idx = 0;
-   (*allGenes) = malloc(sizeof(struct gene*)*numGenes);
-   (*allGenes)[idx] = NULL;
-
-   int skippedLinesCounter = 0;
-   while(1) {
-      status = fscanf(fid,"%s\t%s\t%s\t%d\t%d\t%c\t%c\t%c\t%s\n",chr,blah,id,&start,&stop,xy,strand,xz,desc);
-      if(status == EOF)
-         break;
-
-      if (status < 7) {
-         skippedLinesCounter++;
-         continue;
-      }
-
-      if (strcmp(id,"gene")==0) {
-         if ( (*allGenes)[idx] !=NULL )
-            idx++;
-       
-         (*allGenes)[idx] = gene_alloc();
-         (*allGenes)[idx]->start = start;
-         (*allGenes)[idx]->stop = stop;
-
-         //printf("strand: %s %d\n",strand,strcmp(strand,"+"));
-
-         if (strcmp(strand,"+") == 0) {
-            (*allGenes)[idx]->strand = 'D';
-         } else {
-            if (strcmp(strand,"-") == 0)
-               (*allGenes)[idx]->strand = 'P';
-            else
-               (*allGenes)[idx]->strand = 'z';
-         }
-         assert( (*allGenes)[idx]->strand != 'z' );
-
-         (*allGenes)[idx]->id = get_id(rx,desc);
-         //printf("gene start/stop: %d/%d\n",start,stop);
-         continue;
-      }
-
-      if (strcmp(id,"exon")==0) {
-         add_exon((*allGenes)[idx],start,stop);
-         //printf("exon start/stop: %d/%d\n",start-1,stop);
-         continue;
-      }
-
-      if (strcmp(id,"pseudogene")==0) {
-         if ( (*allGenes)[idx] !=NULL )
-            idx++;
-      }
-   }
-
-   if ( (*allGenes)[idx] !=NULL )
-      idx++;
-
-   //printf("allGenes[0] is %d\n",(*allGenes)[0]);
-   //printf("allGenes[1] is %d\n",(*allGenes)[1]);
-   //printf("Skipped %d lines.\n",skippedLinesCounter);
-
-   regfree(&rx);
-   free(chr);
-   free(blah);
-   free(id);
-   free(desc);
-   free(xy);
-   free(strand);
-   free(xz);
-
-   return numGenes;
-}
diff --git a/tools/data_tools/read.h b/tools/data_tools/read.h
deleted file mode 100644 (file)
index d2c55b6..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef __READ_H__
-#define __READ_H__
-
-typedef struct read {
-   int chr;
-   int pos;
-   unsigned long id;
-   char strand;
-   int mismatch;
-   int occurrence;
-   int size;
-   int cut;
-
-   char* seq;
-   char* prb;
-   char* cal_prb;
-   char* chastity;
-} Read;
-
-void init_read(struct read *r, int size) {
-   r->chr         = 0;
-   r->pos         = 0;
-   r->id          = 0;
-   r-> strand     = ' ';
-   r->mismatch    = 0;
-   r->occurrence  = 0;
-   r->size        = size;
-   r->cut         = 0;
-
-   r->seq      = malloc(sizeof(char)*(r->size));
-   r->prb      = malloc(sizeof(char)*(r->size));
-   r->cal_prb  = malloc(sizeof(char)*(r->size));
-   r->chastity = malloc(sizeof(char)*(r->size));
-}
-
-void create_read(Read* newRead, int chr,int pos, char* seq, unsigned long id, char strand, int mismatch, int occurrence, int size, int cut, char* prb, char* cal_prb, char* chastity) {
-   newRead->chr         = chr;
-   newRead->pos         = pos;
-   newRead->id          = id;
-   newRead-> strand     = strand;
-   newRead->mismatch    = mismatch;
-   newRead->occurrence  = occurrence;
-   newRead->size        = size;
-   newRead->cut         = cut;
-
-   newRead->seq      = malloc(sizeof(char)*(size));
-   newRead->prb      = malloc(sizeof(char)*(size));
-   newRead->cal_prb  = malloc(sizeof(char)*(size));
-   newRead->chastity = malloc(sizeof(char)*(size));
-
-   strncpy(newRead->seq,seq,size);
-   strncpy(newRead->prb,prb,size);
-   strncpy(newRead->cal_prb,cal_prb,size);
-   strncpy(newRead->chastity,chastity,size);
-}
-
-
-
-#endif // __READ_H__
diff --git a/tools/data_tools/sort_genes.c b/tools/data_tools/sort_genes.c
deleted file mode 100644 (file)
index 221b61a..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "datastructures.h"
-
-void sort_genes(struct gene*** allGenes, int numGenes) {
-
-   int gstart = (*allGenes)[0]->start;
-   int gstop = (*allGenes)[0]->stop;
-   int pre_start;
-   int pre_stop;
-
-   //struct gene* tmp;
-   int round;
-   int gidx;
-
-   for(round=0;round<1;round++) {
-      printf("Next round!\n");
-
-      for(gidx=1;gidx<numGenes;gidx++) {
-         pre_start = gstart;
-         pre_stop = gstop;
-         gstart = (*allGenes)[gidx]->start;
-         gstop  = (*allGenes)[gidx]->stop;
-
-         if ( pre_stop > gstart ) {
-            printf("Swapping genes!\n");
-            printf("idx is %d\n",gidx);
-            //printf("pos are %d %d - %d %d\n",pre_start,pre_stop,gstart,gstop);
-            //(*allGenes)[gidx-1] = (*allGenes)[gidx];
-            //(*allGenes)[gidx] = tmp;
-            //printf("pos with new indices: %d %d - %d %d\n",(*allGenes)[gidx-1]->start,(*allGenes)[gidx-1]->stop,(*allGenes)[gidx]->start,(*allGenes)[gidx]->stop);
-            gstart = (*allGenes)[gidx]->start;
-            gstop = (*allGenes)[gidx]->stop;
-            (*allGenes)[gidx] = NULL;
-         }
-      }
-   }
-   printf("End of swapping elements!\n");
-}
diff --git a/tools/data_tools/tools.c b/tools/data_tools/tools.c
deleted file mode 100644 (file)
index 35cb015..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "debug_tools.h"
-
-int fitting(char* up_prb, char* down_prb, int u_off, int d_off, int u_size, int d_size) {
-   double current_mean_up = 0;
-   double current_mean_down = 0;
-
-   int w_size = 2;
-
-   int idx;
-
-   //printf("prb %s\n",up_prb);
-   //printf("prb %s\n",down_prb);
-
-   for(idx=0;idx<w_size;idx++) {
-      current_mean_up += up_prb[u_off+u_size+idx]-50;
-      current_mean_up += up_prb[u_off+u_size-idx]-50;
-      current_mean_up -= up_prb[idx]-50;
-
-      current_mean_down += up_prb[d_off+idx]-50;
-      current_mean_down += up_prb[d_off+idx]-50;
-      current_mean_down -= up_prb[idx]-50;
-   }
-
-   current_mean_up /= (2*w_size+1);
-   current_mean_down /= (2*w_size+1);
-
-   double ratio;
-   if( current_mean_up  > current_mean_down)
-      ratio = current_mean_down / current_mean_up;
-   else
-      ratio = current_mean_up / current_mean_down;
-
-   //printf("ratio is %f\n",ratio);
-
-   if (ratio < 0.35)
-      return 0;
-
-   return 1;
-}
-
-/*
- * As we can have 3 kinds of ambiguities
- *
- * [AG]
- * [A-]
- * [-A]
- *
- * we want to remove/store these ambiguities in the filtered reads
- */
-
-int remove_ambiguities(char * old_seq, char* new_seq) {
-
-   int idx=0;
-   int new_idx = 0;
-   int old_seq_size = strlen(old_seq);
-
-   int est_deletions_ctr = 0;
-
-   while(idx<old_seq_size) {
-      //printf("Current elem %c pos %d %d\n",old_seq[idx],idx,new_idx);
-      if (old_seq[idx] == ']') {
-         idx++;
-         continue;
-      }
-
-      if (old_seq[idx] == '[') {
-
-         // we have a indel either on the dna or read sequence
-         if (old_seq[idx+1] == '-' || old_seq[idx+2] == '-') {
-            
-            if(old_seq[idx+2] == '-')
-               est_deletions_ctr += 1;
-
-            while(1) { 
-               new_seq[new_idx] = old_seq[idx];
-               if(old_seq[idx] == ']')
-                  break;
-               new_idx++;
-               idx++;
-            }
-
-         } else {
-            idx += 2;
-            continue;
-         }
-      }
-
-      new_seq[new_idx] = old_seq[idx];
-      idx++;
-      new_idx++;
-   }
-
-   new_seq[new_idx] = '\0';
-   //printf("new_seq is %d :: %s\n",new_idx,new_seq);
-   return est_deletions_ctr;
-}
diff --git a/tools/dataset_scripts/countNotOnChr1-5_pos.py b/tools/dataset_scripts/countNotOnChr1-5_pos.py
deleted file mode 100644 (file)
index cd65731..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-import sys
-import qparser
-
-def check(filename):
-   #all_filtered_reads = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
-   #qparser.parse_reads(all_filtered_reads)
-
-   map = {}
-   
-   for line in open(filename):
-
-      sl = line.split()
-      id    = int(sl[0])
-      chromo= int(sl[1])
-      strand = sl[3]
-      if strand == 'D':
-         strand = '+'
-      if strand == 'P':
-         strand = '-'
-
-      if map.has_key(id):
-         current_strand,current_chromo = map[id]
-         if current_strand == '+' and current_chromo in range(1,6):
-            continue
-         else:
-            map[id] = (strand,chromo)
-      else:
-         map[id] = (strand,chromo)
-
-   total_ctr = 0
-   spliced_ctr = 0
-   correct_ctr = 0
-   correct_spliced  = 0
-   correct_unspliced = 0
-
-   for id,elem in map.iteritems():
-      current_strand,current_chromo = elem
-      if id < 1000000300000:
-         spliced_ctr += 1 
-      #print elem
-      if current_strand == '+' and current_chromo in range(1,6):
-         correct_ctr += 1
-         if id < 1000000300000:
-            correct_spliced += 1
-         else:
-            correct_unspliced += 1
-      total_ctr += 1
-
-   print 'total ctr %d' % total_ctr 
-   print 'spliced_ctr %d ' % spliced_ctr
-
-   print 'total correct %d' % correct_ctr 
-   print 'spliced %d ' % correct_spliced
-   print 'unspliced %d ' % correct_unspliced
-
-
-if __name__ == '__main__':
-   check(sys.argv[1])
diff --git a/tools/dataset_scripts/dataStatistic.py b/tools/dataset_scripts/dataStatistic.py
deleted file mode 100644 (file)
index 9a24597..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
-import sys
-
-#
-#
-#
-
-class Data:
-   pass
-
-
-
-
-class DataStatistic:
-
-   def __init__(self,reads_fn,map_fn,map2_fn,heuristic_fn,heuristic_un_fn):
-      self.reads_fn = reads_fn
-      self.map_fn = map_fn
-      self.map2_fn = map2_fn
-      self.heuristic_fn = heuristic_fn
-      self.heuristic_un_fn = heuristic_un_fn
-
-      #self.all_read_ids_set  = set([])
-      #self.spliced_ids_set   = set([])
-      #self.unspliced_ids_set = set([])
-      #self.map_ids_set       = set([])
-      #self.map_2nd_ids_set   = set([])
-
-
-   def calcStat(self,filename,check_strand):
-      spliced_set    = set([])
-      unspliced_set  = set([])
-      all_set        = set([])
-
-      for line in open(filename):
-         sl = line.strip().split()
-         id = int(sl[0])
-         chromo = int(sl[1])
-         strand = sl[3]
-
-         if check_strand and ((not chromo in range(1,6)) or strand != 'D'):
-            continue
-
-         if id < 1000000300000:
-            spliced_set.add(id)
-         else:
-            unspliced_set.add(id)
-
-         all_set.add(id)
-
-      return spliced_set,unspliced_set,all_set
-
-
-   def calculate(self):
-      all_spliced_set,all_unspliced_set,all_set = self.calcStat(self.reads_fn,False)
-      res = (len(all_spliced_set),len(all_unspliced_set),len(all_set),'total reads')
-      self.printResult(res)
-
-      check_switch = True
-      map_spliced_set,map_unspliced_set,map_set =\
-      self.calcStat(self.map_fn,check_switch)
-      res = (len(map_spliced_set),len(map_unspliced_set),len(map_set),'map reads')
-      self.printResult(res)
-
-      map2_spliced_set,map2_unspliced_set,map2_set =\
-      self.calcStat(self.map2_fn,check_switch)
-      res = (len(map2_spliced_set),len(map2_unspliced_set),len(map2_set),'map2 reads')
-      self.printResult(res)
-
-      heuristic_spliced_set,heuristic_unspliced_set,heuristic_set =\
-      self.calcStat(self.heuristic_fn,check_switch)
-      res = (len(heuristic_spliced_set),len(heuristic_unspliced_set),len(heuristic_set),'heuristic reads')
-      self.printResult(res)
-
-      heuristic_un_spliced_set,heuristic_un_unspliced_set,heuristic_un_set =\
-      self.calcStat(self.heuristic_un_fn,check_switch)
-      res = (len(heuristic_un_spliced_set),len(heuristic_un_unspliced_set),len(heuristic_un_set),'heuristic_un reads')
-      self.printResult(res)
-
-      all_found_set = map_set | map2_set
-      print 'found %d reads' % len(all_found_set)
-
-      not_found_set = all_set - all_found_set
-      print 'could not find %d reads' % len(not_found_set)
-
-      all_found_spliced_set = map_spliced_set | map2_spliced_set
-      print 'found %d spliced reads' % len(all_found_spliced_set)
-
-      not_found_spliced_set = all_spliced_set - all_found_spliced_set
-      print 'could not find %d spliced reads' % len(not_found_spliced_set)
-
-
-      all_found_unspliced_set = map_unspliced_set | map2_unspliced_set
-      not_found_unspliced_set = all_unspliced_set - all_found_unspliced_set 
-      print 'could not find %d unspliced reads' % len(not_found_unspliced_set)
-      for i in range(10):
-         print not_found_unspliced_set.pop()
-
-
-   def printResult(self,res):
-      mes = '%s:\t%d\n' % (res[3],res[2])
-      mes += 'spliced reads:\t%d (%f)\n' % (res[0],(1.0*res[0])/res[2])
-      mes += 'unspliced reads:\t%d (%f)\n' % (res[1],(1.0*res[1])/res[2])
-      print mes
-
-
-if __name__ == '__main__':
-
-   ds = DataStatistic(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5])
-   ds.calculate()
diff --git a/tools/dataset_scripts/mappingInfo.sh b/tools/dataset_scripts/mappingInfo.sh
deleted file mode 100755 (executable)
index f8b34bc..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-dir=$1
-
-num=`cut -f1 $dir/main/reads_0.fl | sort -u | wc -l`
-echo "Started 1st vmatch run with $num reads."
-
-num=`cut -f1 $dir/main/map.vm | sort -u | wc -l`
-echo "Found $num reads in 1st vmatch run."
-
-num=`cut -f1 $dir/spliced/reads_0.fl | sort -u | wc -l`
-echo "Started 2nd vmatch run with $num reads."
-
-num=`cut -f1 $dir/spliced/map.vm | sort -u | wc -l`
-echo "Found $num reads in 2nd vmatch run"
diff --git a/tools/evaluatePrediction.py b/tools/evaluatePrediction.py
deleted file mode 100644 (file)
index ee8e9af..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:latin-1 -*-
-
-import sys
-import cPickle
-
-ground_truth   = cPickle.load(sys.argv[1])
-prediction     = cPickle.load(sys.argv[2])
-
-ESTs     = ground_truth['TestEsts']
-Exons    = ground_truth['TestExon']
-Acc      = ground_truth['TestAcc']
-Don      = ground_truth['TestDon']
-DNA      = ground_truth['Test']
-
-totalWrongPos = [0]*len(prediction)
-
-for pos,alignmentObj in enumerate(prediction):
-   predPos  = zips(alignmentObj.tStarts, alignmentObj.tEnds)
-   wrongPos  = 0
-
-   for predExonStart,predExonStop in predPos:
-
-      for pos,currentExons in enumerate(Exons):
-         for idx in range(1,len(currentExons)-1):
-
-            intronStart = currentExons[idx][1]
-            intronStop  = currentExons[idx+1][0]
-
-            # est is covering full intron
-            if intronStart > predExonStart and intronStop < predExonStop:
-               #fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop));
-               wrongPos += intronStop-intronStart
-            # est is nested inside intron
-            elif intronStart < predExonStart and intronStop > predExonStop:
-               #fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop));
-               wrongPos += intronStop-intronStart
-            # end of exonth is nested inside predExoniction
-            elif intronStart > predExonStart and predExonStop > intronStart and intronStop > predExonStop:
-               #fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop));
-               wrongPos += intronStop-intronStart
-            # predExoniction is nested inside exonth
-            elif intronStart < predExonStart and predExonStart < intronStop and intronStop < predExonStop:
-               #fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop));
-               wrongPos += intronStop-intronStart
-            else:
-               pass
-   
-   totalWrongPositions[pos] = wrongPos
diff --git a/tools/extractESTs b/tools/extractESTs
deleted file mode 100755 (executable)
index f59f6d1..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-threshold=100
-
-cat $1 | sort | uniq > $1.uniq
-python calculateSizes $1.uniq | sort -rn | grep '$[0-9]{3,3}' > $1.uniq.over.${threshold}
diff --git a/tools/plot_param.py b/tools/plot_param.py
deleted file mode 100644 (file)
index 6a43fd5..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-
-"""A tool to visualise the parameter vector of QPalma"""
-
-import pylab
-import cPickle
-import numpy
-import sys
-
-import qpalma
-from qpalma.set_param_palma import set_param_palma
-import qpalma.Configuration as Configuration
-
-# remember to check for consistency with Configuration.py
-
-# some variables for testing
-#filename = 'param_1.pickle'
-#train_with_intronlengthinformation = 1
-
-
-
-def plot_param(filename,train_with_intronlengthinformation=1):
-    """Load filename, and plot 3 figures:
-       - match matrix
-       - acceptor and donor scores
-       - intron length scores
-    """
-    assert(Configuration.mode == 'using_quality_scores')
-    param = cPickle.load(open(filename))
-    [h,d,a,mmatrix,qualityPlifs] =\
-    set_param_palma(param,train_with_intronlengthinformation,None)
-
-    # construct the substitution matrix using the last value of qualityPlif
-    substmat = numpy.matlib.zeros((1,(Configuration.estPlifs+1)*Configuration.dnaPlifs))
-    substmat[0,:Configuration.sizeMatchmatrix[0]] = mmatrix.T
-    
-    for sidx,plif in enumerate(qualityPlifs):
-        substmat[0,sidx+Configuration.sizeMatchmatrix[0]] = plif.penalties[-1]
-
-    substmat = numpy.reshape(substmat,(Configuration.estPlifs+1,Configuration.dnaPlifs))
-
-    # set up default parameters
-    pylab.rcParams.update({'xtick.labelsize' : 20,
-                           'ytick.labelsize' : 20,
-                           'legend.fontsize' : 20,
-                           })
-
-    # plot the substitution matrix
-    fig1=pylab.imshow(substmat,cmap=pylab.cm.summer,interpolation='nearest')
-    pylab.colorbar()
-    fig1.axes.xaxis.set_ticks_position('top')
-    fig1.axes.xaxis.set_label_position('top')
-    pylab.xticks( numpy.arange(0.5,6,1), ('-', 'A', 'C', 'G', 'T', 'N') )
-    pylab.yticks( numpy.arange(5.5,0,-1), ('-', 'A', 'C', 'G', 'T', 'N') )
-    pylab.xlabel('DNA',fontsize=20)
-    pylab.ylabel('EST',fontsize=20)
-
-    pylab.savefig('matrix.eps')
-    pylab.clf()
-    
-    # plot donor and acceptor transformations
-    pylab.figure()
-    pylab.plot(a.limits,a.penalties,'bd-',label='Acceptor',hold=True)
-    pylab.plot(d.limits,d.penalties,'ro--',label='Donor')
-    pylab.xlabel('SVM output',fontsize=20)
-    pylab.ylabel('transition score',fontsize=20)
-    pylab.legend()
-
-    pylab.savefig('donacc.eps')
-    pylab.clf()
-
-    # plot intron length transformations
-    pylab.figure()
-    pylab.plot(h.limits,h.penalties,'k+-')
-    pylab.xlabel('intron length',fontsize=20)
-    pylab.ylabel('transition score',fontsize=20)
-
-    pylab.savefig('intron_len.eps')
-    pylab.clf()
-
-    # plot quality score transformations
-    pylab.figure()
-    for pos,plif in enumerate(qualityPlifs):
-      if pos in [1,8,15,22,7,13,19]:
-      #if pos in [3,9,21]:
-         pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
-
-      #pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
-    pylab.xlabel('quality value',fontsize=20)
-    pylab.ylabel('transition score',fontsize=20)
-
-    pylab.savefig('quality.eps')
-    pylab.clf()
-
-    #pylab.show()
-    
-
-if __name__ == '__main__':
-    twil = 1
-    if len(sys.argv) < 2:
-        print 'usage: python plot_param.py paramfile.pickle [train_with_intronlengthinformation]'
-        exit()
-    elif len(sys.argv) == 3:
-        twil = sys.argv[2]
-    filename = sys.argv[1]
-
-    plot_param(filename,twil)
-    try:
-        dummy = input('Hit enter when finished.')
-    except:
-        exit()
-
-
diff --git a/tools/run_specific_scripts/training_dataset/compile_dataset.py b/tools/run_specific_scripts/training_dataset/compile_dataset.py
deleted file mode 100644 (file)
index a125664..0000000
+++ /dev/null
@@ -1,435 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import random
-import os
-import re
-import pdb
-import cPickle
-
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
-
-import qpalma
-import qpalma.tools
-
-from qpalma.parsers import *
-
-from Genefinding import *
-
-from genome_utils import load_genomic
-
-import qpalma.Configuration as Conf
-
-
-class DatasetGenerator:
-   """
-
-   """
-   
-   def __init__(self,filtered_reads,map_file,map_2nd_file):
-      assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
-      self.filtered_reads = filtered_reads
-
-      assert os.path.exists(map_file), 'Error: Can not find map file'
-      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
-      self.map_file = map_file
-      self.map_2nd_file = map_2nd_file
-
-      self.training_set = []
-      self.testing_set  = []
-
-      print 'parsing filtered reads..'
-      self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
-      print 'found %d filtered reads' % len(self.all_filtered_reads)
-
-
-   def saveAs(self,dataset_file):
-      assert not os.path.exists(dataset_file), 'The data_file already exists!'
-
-      # saving new datasets
-
-      #all_keys = self.training_set.keys()
-      #random.shuffle(all_keys)
-      #training_keys = all_keys[0:10000]
-      #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
-      #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
-
-      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
-
-      prediction_keys = [0]*len(self.testing_set.keys())
-      for pos,key in enumerate(self.testing_set.keys()):
-         prediction_keys[pos] = key
-
-      cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
-
-
-   def compile_training_set(self):
-      # this stores the new dataset
-      dataset = {}
-
-      # Iterate over all remapped reads in order to generate for each read a
-      # training / prediction example
-      instance_counter = 1
-      skipped_ctr = 0
-
-      for id,filteredRead in self.all_filtered_reads.items():
-         if instance_counter % 1001 == 0:
-            print 'processed %d examples' % instance_counter
-
-         # training set consists only of spliced reads
-         if not id < 1000000300000:
-            continue 
-
-         if filteredRead['strand'] != '+':
-            skipped_ctr += 1
-            continue
-
-         if not filteredRead['chr'] in range(1,6):
-            skipped_ctr += 1
-            continue
-
-         # we cut out the genomic region for training 
-         CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-         genomicSeq_start  = filteredRead['p_start'] - CUT_OFFSET
-         genomicSeq_stop   = filteredRead['p_stop'] + CUT_OFFSET
-
-         # information of the full read we need for training
-         chromo         = filteredRead['chr']
-         strand         = filteredRead['strand']
-         original_est   = filteredRead['seq']
-         quality        = filteredRead['prb']
-         cal_prb        = filteredRead['cal_prb']
-         chastity       = filteredRead['chastity']
-
-         currentExons = zeros((2,2),dtype=numpy.int)
-         currentExons[0,0] = filteredRead['p_start']
-         currentExons[0,1] = filteredRead['exon_stop']
-         currentExons[1,0] = filteredRead['exon_start']
-         currentExons[1,1] = filteredRead['p_stop']
-
-         # add instance to set
-         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (quality,cal_prb,chastity)
-
-         dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
-
-         instance_counter += 1
-
-      print 'Full dataset has size %d' % len(dataset)
-      print 'Skipped %d reads' % skipped_ctr
-
-      self.training_set = dataset
-      
-
-   def parse_map_file(self,dataset,map_file):
-      strand_map = ['-','+']
-      instance_counter = 1
-
-      for line in open(map_file):
-         if instance_counter % 1001 == 0:
-            print 'processed %d examples' % instance_counter
-
-         line  = line.strip()
-         slist = line.split()
-         id    = int(slist[0])
-         chromo   = int(slist[1])
-         pos      = int(slist[2])
-         strand   = slist[3]
-         strand   = strand_map[strand == 'D']
-
-         if strand != '+':
-            continue
-
-         if not chromo in range(1,6):
-            continue
-
-         genomicSeq_start = pos - 1500
-         genomicSeq_stop  = pos + 1500
-
-         # fetch missing information from original reads
-         filteredRead = self.all_filtered_reads[id]
-         original_est = filteredRead['seq']
-         original_est = original_est.lower()
-
-         original_est = filteredRead['seq']
-         prb     = filteredRead['prb']
-         cal_prb = filteredRead['cal_prb']
-         chastity = filteredRead['chastity']
-
-         # add instance to set
-         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (prb,cal_prb,chastity)
-
-         # as one single read can have several vmatch matches we store all
-         # these matches under the unique id of the read
-         if dataset.has_key(id):
-            old_entry = dataset[id]
-            old_entry.append((currentSeqInfo,original_est,currentQualities))
-            dataset[id] = old_entry
-         else:
-            dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
-
-         instance_counter += 1
-
-      print 'Total examples processed: %d' % instance_counter
-
-      return dataset
-
-
-   def compile_testing_set(self):
-
-      dataset = {}
-
-      # usually we have two files to parse:
-      # the map file from the second run and a subset of the map file from the
-      # first run
-      dataset = self.parse_map_file(dataset,self.map_file)
-      dataset = self.parse_map_file(dataset,self.map_2nd_file)
-
-      # store the full set
-      self.testing_set = dataset
-
-
-def compile_dataset_direct(filtered_reads,dataset_file):
-
-   strand_map = ['-','+']
-
-   SeqInfo = []
-   OriginalEsts = []
-   Qualities = []
-
-   instance_counter = 0
-
-   for line in open(filtered_reads):
-      line  = line.strip()
-      slist = line.split()
-      id    = int(slist[0])
-
-      if not id < 1000000300000:
-         continue
-
-      if instance_counter % 1000 == 0:
-         print 'processed %d examples' % instance_counter
-
-      chr      = int(slist[1])
-      strand   = slist[2]
-      strand   = strand_map[strand == 'D']
-
-      genomicSeq_start = int(slist[10]) - 1000
-      genomicSeq_stop  = int(slist[13] ) + 1000
-
-      original_est = slist[3]
-      original_est = original_est.lower()
-      #print original_est
-
-      prb      = [ord(elem)-50 for elem in slist[6]]
-      cal_prb  = [ord(elem)-64 for elem in slist[7]]
-      chastity = [ord(elem)+10 for elem in slist[8]]
-
-      #pdb.set_trace()
-
-      # add instance to set
-      SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
-      OriginalEsts.append(original_est)
-      Qualities.append( (prb,cal_prb,chastity) )
-
-      instance_counter += 1
-
-   dataset = [SeqInfo, OriginalEsts, Qualities ]
-
-   # saving new dataset
-   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
-
-
-
-
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
-   """
-   Now we want to use interval_query to get the predicted splice scores trained
-   on the TAIR sequence and annotation.
-   """
-
-   size = intervalEnd-intervalBegin
-   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
-
-   acc = size*[0.0]
-   don = size*[0.0]
-
-   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
-   pos_size = new_intp()
-   intp_assign(pos_size,1)
-
-   # fetch acceptor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   # fetch donor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   return acc, don
-
-
-def process_map(currentRRead,fRead):
-   """
-   For all matches found by Vmatch we calculate the fragment of the DNA we
-   would like to perform an aligment during prediction.
-   """
-
-   fragment_size = 3000
-
-   alternativeSeq = []
-
-   onePositiveLabel = False
-
-   rId      = currentRRead['id']
-   pos      = currentRRead['pos']
-   chr      = currentRRead['chr']
-   strand   = currentRRead['strand']
-
-   length   = currentRRead['length']
-   offset   = currentRRead['offset']
-
-   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-
-   # vmatch found the correct position
-   if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
-      genomicSeq_start = fRead['p_start'] - CUT_OFFSET
-      genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
-   else:
-      genomicSeq_start = pos - (fragment_size/2)
-      genomicSeq_stop = pos + (fragment_size/2)
-
-   return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
-
-
-def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
-   """
-   This function expects an interval, chromosome and strand information and
-   returns then the genomic sequence of this interval and the associated scores.
-   """
-
-   assert genomicSeq_start < genomicSeq_stop
-
-   chrom         = 'chr%d' % chr
-   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   # check the obtained dna sequence
-   assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   
-   # all entries other than a c g t are set to n
-   no_base = re.compile('[^acgt]')
-   genomicSeq = no_base.sub('n',genomicSeq)
-
-   intervalBegin  = genomicSeq_start-100
-   intervalEnd    = genomicSeq_stop+100
-   seq_pos_offset = genomicSeq_start
-
-   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
-
-   currentAcc = currentAcc[100:-98]
-   currentAcc = currentAcc[1:]
-   currentDon = currentDon[100:-100]
-
-   length = len(genomicSeq)
-   currentAcc = currentAcc[:length]
-
-   currentDon = currentDon+[-inf]*(length-len(currentDon))
-
-   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
-   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-
-   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
-   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
-   assert len(currentAcc) == len(currentDon)
-
-   return genomicSeq, currentAcc, currentDon
-
-
-def reverse_complement(seq):
-   map = {'a':'t','c':'g','g':'c','t':'a'}
-
-   new_seq = [map[elem] for elem in seq]
-   new_seq.reverse()
-   new_seq = "".join(new_seq)
-
-   return new_seq
-
-
-def get_seq(begin,end,exon_end):
-   """
-   """
-
-   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-   if exon_end:
-      gene_start = begin
-      gene_stop  = end+2
-   else:
-      gene_start = begin-2
-      gene_stop  = end
-
-   chrom         = 'chr%d' % 1
-   strand        = '+'
-
-   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   return genomicSeq
-
-
-def parseLine(line):
-   """
-   We assume that a line has the following entries:
-   
-   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
-
-   """
-   #try:
-   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-   #except:
-   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-   #   true_cut = -1
-
-   splitpos = int(splitpos)
-   read_size = int(read_size)
-
-   seq=seq.lower()
-
-   assert strand in ['D','P']
-
-   if strand == 'D':
-      strand = '+'
-
-   if strand == 'P':
-      strand = '-'
-
-   chr = int(chr)
-
-   prb = [ord(elem)-50 for elem in prb]
-   cal_prb = [ord(elem)-64 for elem in cal_prb]
-   chastity = [ord(elem)+10 for elem in chastity]
-
-   p_start = int(p_start)
-   exon_stop = int(exon_stop)
-   exon_start = int(exon_start)
-   p_stop = int(p_stop)
-   true_cut = int(true_cut)
-
-   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
-   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
-   'p_stop':p_stop,'true_cut':true_cut}
-
-   return line_d
-
-
-if __name__ == '__main__':
-   if len(sys.argv) == 1:
-      print info
-
-   assert len(sys.argv) == 6, help
-   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
diff --git a/tools/run_specific_scripts/training_dataset/createNewDataset.py b/tools/run_specific_scripts/training_dataset/createNewDataset.py
deleted file mode 100644 (file)
index fa64382..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import qpalma.Configuration as Conf
-from compile_dataset import DatasetGenerator
-
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
-
-map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
-map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
-
-dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
-#dg.compile_training_set()
-dg.compile_testing_set()
-dg.saveAs('dataset_19_05_08')
diff --git a/tools/run_specific_scripts/transcriptome_analysis/README b/tools/run_specific_scripts/transcriptome_analysis/README
deleted file mode 100644 (file)
index 2fd2254..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-----------------
-Processing Steps
-----------------
-
-
-Data Generation
----------------
-
-The data was taken from
-
-/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_40/
-
-directories 1 to 3
-
-We combined the map.vm and map_2nd.vm for all these three dirs using 'createFullMap.sh'
-
-the data was saved in dir /fml/ag-raetsch/home/fabio/tmp/transcriptome_data.
-
-
-QPalma Heuristic
-----------------
-
-Next we split up map.vm to make a parallel run of the QPalma heuristic in order
-to estimate the entries in map.vm that should be aligned by QPalma.
-
-We combine the results using 'combine_spliced_map_parts.sh'
-
-
-QPalma Dataset Generation
--------------------------
-
-Once we have the map_2nd.vm and the map.vm.spliced files we create a QPalma
-dataset using 'createNewDataset.py'
-
-
-
-
-Coverage Number Estimation
---------------------------
-
-For this purpose we need the raw fasta files for the original reads.  In the
-directory 'raw_reads' there should be a file 'reads_0.fa' containing all
-original reads in fasta format.
-
-
-
-
-New Vmatch criteria:
-
-
-Old settings 1 mismatch:
-
-/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced
-
-
-2 Mismatches (stepwise trimming -3 from 35 to 20)
-/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced_3
-
-
-
diff --git a/tools/run_specific_scripts/transcriptome_analysis/combine_spliced_map_parts.sh b/tools/run_specific_scripts/transcriptome_analysis/combine_spliced_map_parts.sh
deleted file mode 100755 (executable)
index 47e2d42..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-result_dir=$1
-
-result_fn=$result_dir/map.vm.spliced
-
-touch $result_fn
-rm $result_fn
-
-for chunk in `ls -1 $result_dir/*.heuristic.spliced`
-do
-   echo $chunk 
-   cat $chunk >> $result_fn
-done
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/Makefile b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/Makefile
deleted file mode 100644 (file)
index 7a907e8..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-CC=gcc
-CFLAGS=-ggdb -Wall -Wshadow -lm #-pedantic
-
-#SRCS= parser.c\
-#OBJS = $(SRCS:%.cpp=%.o)
-
-all: compare.c
-       @ echo "Compiling..."
-       @ $(CC) $(CFLAGS) -o compare compare.c
-
-#@ $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
-
-clean:
-       @ rm -rf *.o
-
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/compare.c
deleted file mode 100644 (file)
index 86e174e..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-// use macro for GNU getline
-#define _GNU_SOURCE 
-#include <stdio.h>
-
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <unistd.h>
-#include <math.h>
-
-/*
- * This program compares two lists of intron positions stemming for example from
- * a gff file in order to evaluate predictions made by QPalma.
- *
- * It is assumed that the first file contains the ground truth and the second
- * file the predicted positions.
- *
- */
-
-int* gt_intron_starts;
-int* gt_intron_stops;
-int gt_size;
-
-int* pred_intron_starts;
-int* pred_intron_stops;
-int pred_size;
-
-int matching_values;
-
-static char* info = "Usage: ./<app name> <input 1 file name> <input 2 file name> <result file name>";
-
-static int chromo_sizes[5] = {30432563, 19705359, 23470805, 18585042, 26992728};
-
-short strand_info;
-
-
-void check_seeds() {
-
-   matching_values = 0;
-   int pred_idx, gt_idx, chromo, tmp;
-   int pos,gt_start,gt_stop,pred_start,pred_stop;
-
-   printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
-
-   for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
-
-      pred_start = pos-1500;
-      pred_stop = pos+1500;
-      pos = pred_intron_stops[pred_idx];
-      chromo = pred_intron_starts[pred_idx];
-
-      for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
-
-         gt_start = gt_intron_starts[gt_idx];
-         gt_stop = gt_intron_stops[gt_idx];
-
-         if(strand_info == 0) {
-            tmp = gt_start;
-            gt_start = chromo_sizes[chromo] - gt_stop;
-            gt_stop =  chromo_sizes[chromo] - tmp;
-         }
-
-         //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
-         if ( pred_start <= gt_start && gt_stop <= pred_stop ) {
-            matching_values++;
-            break;
-         }
-
-         //if (gt_start == pred_start && gt_stop == pred_stop)
-         //   matching_values++;
-   }}
-}
-
-
-void compare_introns() {
-
-   matching_values = 0;
-   int pred_idx, gt_idx;
-   int gt_start,gt_stop,pred_start,pred_stop;
-
-   printf("sizes are %d/%d (gt/pred)\n",gt_size,pred_size);
-
-   for(pred_idx=0;pred_idx<pred_size;pred_idx++) {
-
-      pred_start = pred_intron_starts[pred_idx];
-      pred_stop = pred_intron_stops[pred_idx];
-
-      for(gt_idx=0;gt_idx<gt_size;gt_idx++) {
-
-         gt_start = gt_intron_starts[gt_idx];
-         gt_stop = gt_intron_stops[gt_idx];
-
-         //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
-         if ( fabs(gt_start - pred_start) <= 2 && fabs(gt_stop - pred_stop) <= 2) {
-            matching_values++;
-            break;
-         }
-         //if (gt_start == pred_start && gt_stop == pred_stop)
-         //   matching_values++;
-   }}
-}
-
-
-/* 
- * This function compares the intron boundaries of the two files.
- *
- */
-
-void _compare_introns() {
-
-   matching_values = 0;
-   
-   int gt_idx = 0;
-   int pred_idx = 0;
-   int gt_start,gt_stop,pred_start,pred_stop;
-
-   printf("sizes are %d/%d\n",gt_size,pred_size);
-
-   while(1) {
-      if (gt_idx == gt_size || pred_idx == pred_size)
-         break;
-
-      gt_start = gt_intron_starts[gt_idx];
-      gt_stop = gt_intron_stops[gt_idx];
-
-      pred_start = pred_intron_starts[pred_idx];
-      pred_stop = pred_intron_stops[pred_idx];
-
-      //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
-      if (gt_start == pred_start && gt_stop == pred_stop) {
-         matching_values++;
-
-         // take doubles into account
-#if 1
-         int idx = pred_idx+1;
-         for(;idx<pred_size;idx++) {
-            if (pred_intron_starts[idx] == pred_start && pred_intron_stops[idx] == pred_stop)
-               matching_values++;
-            else
-               break;
-         }
-#endif
-         gt_idx++;
-         pred_idx++;
-         continue;
-      }
-
-      // check for all possible overlappings / positions
-      if ( gt_stop <= pred_start ) {
-         gt_idx++;
-         continue;
-      }
-
-      if ( pred_stop <= gt_start ) {
-         pred_idx++;
-         continue;
-      }
-
-      if (gt_stop <= pred_stop && gt_stop >= pred_start) {
-         gt_idx++;
-         continue;
-      }
-
-      if (pred_stop <= gt_stop && pred_stop >= gt_start) {
-         pred_idx++;
-         continue;
-      }
-   }
-}
-
-
-/*
- *
- *
- *
- */
-
-void load_introns(char* fn, int** starts, int** stops, int* size) {
-
-   FILE *fs = fopen(fn,"r");
-   if(fs == NULL) {
-      printf("Error: Could not open file: %s",fn);
-      exit(EXIT_FAILURE);
-   }
-
-   size_t line_size = 256;
-   char* current_line = malloc(sizeof(char)*line_size);
-   size_t status;
-
-   // we count the number of lines the file has
-   *size = 0;
-   while( getline(&current_line,&line_size,fs) != -1 ) (*size)++;
-
-   // now we allocate memory to store all positions
-   //*id      = malloc(sizeof(char)*(*size));
-   *starts  = malloc(sizeof(int)*(*size));
-   *stops   = malloc(sizeof(int)*(*size));
-
-   if ( *starts == NULL || *stops == NULL )
-      perror("Could not allocate memory for position arrays");
-
-   fs = freopen(fn,"r",fs);
-   if(fs == NULL) {
-      printf("Error: Could not open file: %s",fn);
-      exit(EXIT_FAILURE);
-   }
-
-   //char* id_string  = malloc(sizeof(char)*line_size);
-   char* first_num  = malloc(sizeof(char)*line_size);
-   char* second_num = malloc(sizeof(char)*line_size);
-
-   int idx;
-   int intron_start;
-   int intron_stop;
-   int ctr = 0;
-
-   while(1) { 
-      status = getline(&current_line,&line_size,fs);
-      if ( status == -1 )
-         break;
-      
-      //printf("%s",current_line);
-      
-      for(idx=0;idx<line_size;idx++) {
-         if (current_line[idx] == ' ')
-            break;
-      }
-      strncpy(first_num,current_line,idx);
-      strncpy(second_num,&current_line[idx],line_size-idx);
-      intron_start = atoi(first_num);
-      intron_stop = atoi(second_num);
-      //printf("start/stop: %d/%d\n",intron_start,intron_stop);
-    
-      (*starts)[ctr] = intron_start;
-      (*stops)[ctr]  = intron_stop;
-      ctr++;
-   }
-
-   //printf("ctr is %d\n",ctr);
-
-   if (fclose(fs) != 0)
-      perror("Closing of filestream failed!");
-}
-
-
-int main(int argc, char* argv[]) {
-
-   if(argc != 6) {
-      printf("%s\n",info);
-      exit(EXIT_FAILURE);
-   }
-
-   int filenameSize = 256;
-   char* gt_fn    = malloc(sizeof(char)*filenameSize);
-   char* pred_fn  = malloc(sizeof(char)*filenameSize);
-   char* result_fn= malloc(sizeof(char)*filenameSize);
-
-   if ( gt_fn == NULL || pred_fn == NULL || result_fn == NULL)
-      perror("Could not allocate memory for filenames");
-
-   short intron_mode = -1;
-   if ( strcmp(argv[1],"-intron") == 0 )
-      intron_mode = 1;
-
-   if ( strcmp(argv[1],"-seed") == 0 )
-      intron_mode = 0;
-
-   if (intron_mode == -1)
-      perror("You have to choose between -intron or -seed mode!");
-
-   strand_info = -1;
-   if ( strcmp(argv[2],"-strand=+") == 0 )
-      strand_info = 1;
-
-   if ( strcmp(argv[2],"-strand=-") == 0 )
-      strand_info = 0;
-
-   if (strand_info == -1)
-      perror("You have to choose between -strand=+ or -strand=- !");
-
-   strncpy(gt_fn,argv[3],strlen(argv[3]));
-   strncpy(pred_fn,argv[4],strlen(argv[4]));
-   strncpy(result_fn,argv[5],strlen(argv[5]));
-
-   FILE *result_fs = fopen(result_fn,"w+");
-   if(result_fs == NULL) {
-      printf("Error: Could not open file: %s",result_fn);
-      exit(EXIT_FAILURE);
-   }
-
-   load_introns(gt_fn,&gt_intron_starts,&gt_intron_stops,&gt_size);
-   load_introns(pred_fn,&pred_intron_starts,&pred_intron_stops,&pred_size);
-
-   if (intron_mode == 1)
-      compare_introns();
-   else
-      check_seeds();
-
-   int f_status = fclose(result_fs);
-   if(f_status != 0)
-      printf("closing of result filestream failed!\n");
-
-   free(gt_fn);
-   free(pred_fn);
-   free(result_fn);
-
-   printf("Found %d matching intron(s)/seed(s).\n",matching_values);
-   exit(EXIT_SUCCESS);
-}
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/gt_test b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/gt_test
deleted file mode 100644 (file)
index 89a7b1c..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-1 10
-10 12
-12 14
-20 22
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/pred_test b/tools/run_specific_scripts/transcriptome_analysis/compare_predictions/pred_test
deleted file mode 100644 (file)
index 3c90466..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-1 10
-1 10
-1 10
-10 14
-10 14
-10 14
-14 20
-20 22
-20 22
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py b/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
deleted file mode 100644 (file)
index 7cc028c..0000000
+++ /dev/null
@@ -1,195 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import array
-import cPickle
-import os
-import pdb
-import random
-import sys
-
-import qpalma
-import qpalma.tools
-
-from qpalma.parsers import *
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
-
-import QPalmaConfiguration as Conf
-
-#
-# This script takes as input the map.vm map_2nd.vm files and generates QPalma
-# dataset pickle files
-#
-
-
-class DatasetGenerator:
-   """
-   This class wraps the dataset generation for the matches of the second vmatch
-   run and the heuristic-filtered matches of the first run.
-
-   An example in our dataset consists of:
-
-   - information on the original read:
-         * id
-         * nucleotide sequence
-         * quality vectors (at the moment: prb and chastity)
-
-   - information on the target dna sequence:
-         * chromosome
-         * strand
-         * fragment start 
-         * fragment end positions
-
-   this information is stored in tuples which are stored then in a dictionary
-   indexed by the reads unique id.
-
-   CAUTION: The dictionary stores a list of the above defined tuples under each
-   'id'. This is because one read can be found several times on the genome.
-   """
-
-   def __init__(self,map_file,map_2nd_file):
-      assert os.path.exists(map_file), 'Error: Can not find map file'
-      print map_2nd_file
-      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
-      self.map_file = map_file
-      self.map_2nd_file = map_2nd_file
-
-      self.dataset = []
-
-      self.read_size = 38
-      #self.prb_offset = 50
-      self.prb_offset = 64
-
-      self.half_window_size = 1500
-
-
-   def saveAs(self,dataset_file):
-      dataset_fn = '%s.pickle'% dataset_file
-      dataset_keys_fn = '%s.keys.pickle'%dataset_file
-
-      print dataset_fn, dataset_keys_fn 
-
-      assert not os.path.exists(dataset_fn), 'The data_file already exists!'
-      assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
-
-      #pdb.set_trace()
-      # saving new dataset and single keys as well
-      cPickle.dump(self.dataset,open(dataset_fn,'w+'),protocol=2)
-      cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
-
-
-   def parse_map_file(self,dataset,map_file,first_round):
-      instance_counter = 0
-
-      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
-      g_fmt = 'chr%d.dna.flat'
-      s_fmt = 'contig_%d%s'
-
-      num_chromo = 6
-
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
-      not_consistent_ctr = 0
-
-      for line in open(map_file):
-         if instance_counter > 0 and instance_counter % 5000 == 0:
-            print 'processed %d examples' % instance_counter
-
-         if instance_counter == 10000:
-            print 'Not consistent ctr: %d' % not_consistent_ctr
-            #   break
-
-         slist    = line.split()
-
-         id       = int(slist[0])
-         chromo   = int(slist[1])
-         pos      = int(slist[2])
-
-         # for the time being we only consider chromosomes 1-5
-         if not chromo in range(1,6):
-            continue
-
-         # convert D/P strand info to +/-
-         if slist[3] == 'D':
-            strand = '+'
-         else:
-            strand = '-'
-
-         # QPalma uses lowercase characters
-         bracket_seq = slist[4].lower()
-         read_seq = unbracket_seq(bracket_seq)
-         reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
-
-         if strand == '-' and first_round:
-            read_seq = reverse_complement(read_seq)
-            #reconstructed_dna_seq = reverse_complement(reconstructed_dna_seq)
-
-         assert not '[' in read_seq and not ']' in read_seq
-         
-         # we use an area of +/-  `self.half_window_size` nucleotides around the seed position
-         if pos > self.half_window_size+1:
-            us_offset = self.half_window_size
-         else:
-            us_offset = pos - 1 
-
-         if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
-            ds_offset = self.half_window_size
-         else:
-            ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
-            
-         genomicSeq_start = pos - us_offset
-         genomicSeq_stop  = pos + ds_offset
-
-         dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
-         
-         #if strand == '-':
-         #   dna_part = dna_seq[us_offset-len(read_seq):us_offset]
-         #else:
-         #   dna_part = dna_seq[us_offset:us_offset+len(read_seq)]
-
-         #if reconstructed_dna_seq != dna_part:
-         #   not_consistent_ctr += 1
-         #   pdb.set_trace()
-
-         ## check whether everything is ok
-         #   try:
-         #      currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
-         #   except:
-         #      problem_ctr += 1
-         #      continue
-
-         # In order to save some space we use a signed char to store the
-         # qualities. Each quality element can range as follows: -128 <= elem <= 127
-         
-         prb = array.array('b',map(lambda x: ord(x)-self.prb_offset,slist[5]))
-
-         # add instance to set
-         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = [prb]
-
-         # As one single read can have several vmatch matches we store all
-         # these matches under the unique id of the read
-         dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
-
-         instance_counter += 1
-
-      print 'Total examples processed: %d' % instance_counter
-      print 'Not consistent ctr: %d' % not_consistent_ctr
-      return dataset
-
-
-   def compile_dataset(self):
-      dataset = {}
-
-      # usually we have two files to parse:
-      # the map file from the second run and a subset of the map file from the
-      # first run
-
-      dataset = self.parse_map_file(dataset,self.map_file,True)
-      dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
-
-      self.dataset = dataset
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createExonInfoForGenefinding.py b/tools/run_specific_scripts/transcriptome_analysis/createExonInfoForGenefinding.py
deleted file mode 100755 (executable)
index db3200c..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import sys
-
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
-
-def run(chunk_dir,outfile):
-   #chunk_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
-
-   result_fn=os.path.join(chunk_dir,'all_chunks.txt')
-   cmd = "rm -rf %s; for chunk in `ls -1 %s/chunk*align_remap`; do cat $chunk >> %s; done"%\
-   (result_fn,chunk_dir,result_fn)
-   os.system(cmd)
-
-   out_fh = open(outfile,'w+')
-
-   # fetch the data needed
-   g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
-   g_fmt = 'chr%d.dna.flat'
-   s_fmt = 'contig_%d%s'
-
-   num_chromo = 6
-
-   accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-   seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
-
-   pp = lambda x : str(x)[1:-1].replace(' ','')
-
-   for line in open(result_fn):
-      sl = line.split()
-
-      chromo = int(sl[1])
-      strand = sl[2]
-      start_pos = int(sl[5])-2
-      starts   = sl[15].split(',')
-      ends     = sl[16].split(',')
-
-      ids = sl[-2].split(',')
-      ids   = [int(elem) for elem in ids]
-      gaps = sl[-1].split(',')
-      gaps   = [int(elem) for elem in gaps]
-
-      if strand == '+':
-         pass
-
-      elif strand == '-':
-         total_size = seqInfo.chromo_sizes[chromo]
-         start_pos = total_size - start_pos
-
-         #t_size = 3001
-         #starts  = [t_size-int(e) for e in ends]
-         #starts.reverse()
-         #ends    = [t_size-int(e) for e in starts]
-         #ends.reverse()
-
-         #start_pos = total_size - start_pos
-         #starts   = [total_size - elem for elem in starts]
-         #starts.reverse()
-         #ends     = [total_size - elem for elem in ends]
-         #ends.reverse()
-
-      else:
-         assert False
-
-      starts   = [int(elem) + start_pos for elem in starts]
-      ends     = [int(elem) + start_pos for elem in ends]
-
-      if strand == '+':
-         starts   = [e+1 for e in starts]
-         ends     = [e+1 for e in ends]
-      else:
-         starts   = [e-3001 for e in starts]
-         ends     = [e-3001 for e in ends]
-
-      #out_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),str(start_pos),pp(ids),pp(gaps)))
-      out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
-
-   
-   cmd = 'rm %s' % result_fn
-   os.system(cmd)
-
-if __name__ == '__main__':
-   run(sys.argv[1],sys.argv[2])
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh b/tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh
deleted file mode 100755 (executable)
index af45329..0000000
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-# This script collects data from 
-# /media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_40/
-# and combines the map.vm files to a single map.vm resp. map_2nd.vm
-# 
-
-function combine_and_check_maps {
-   # specifies project dir
-   run_dir=$1
-   # specifies vmatch round (either main or spliced)
-   round_dir=$2
-   # this is the result file ( map.vm or map_2nd.vm)
-   new_map=$3
-   logfile=$4
-
-   echo "Starting logfile entries for $new_map" >> $logfile
-
-   current_map=$run_dir/1/length_38/${round_dir}/map.vm
-   echo "processing $current_map ..."
-   cat  $current_map > $new_map
-   lane1_read_ctr=`cut -f1 $current_map | sort -u | wc -l`
-   echo "lane1 reads $lane1_read_ctr" >> $logfile
-
-   current_map=$run_dir/2/length_38/${round_dir}/map.vm
-   echo "processing $current_map ..."
-   cat $current_map >> $new_map
-   lane2_read_ctr=`cut -f1 $current_map | sort -u | wc -l`
-   echo "lane2 reads $lane2_read_ctr" >> $logfile
-
-   current_map=$run_dir/3/length_38/${round_dir}/map.vm
-   echo "processing $current_map ..."
-   cat $current_map >> $new_map
-   lane3_read_ctr=`cut -f1 $current_map | sort -u | wc -l`
-   echo "lane3 reads $lane3_read_ctr" >> $logfile
-
-   combined_lanes_ctr=`echo "$lane1_read_ctr + $lane2_read_ctr + $lane3_read_ctr" | bc`
-   combined_read_ctr=`cut -f1 $new_map | sort -u | wc -l`
-   echo "$new_map reads $combined_read_ctr" >> $logfile
-
-   # here we check whether the number of reads of the combined file is the sum
-   # of the reads of the single lane files
-   if [ $combined_lanes_ctr -ne $combined_read_ctr ] ; then
-      echo "Error during combination of first vmatch run reads!"
-   fi
-}
-
-run_dir=/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_40
-
-#mkdir /tmp/fabio
-map_1st=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm
-map_2nd=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map_2nd.vm
-logfile=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/logfile.txt
-
-touch $logfile && rm $logfile
-
-combine_and_check_maps $run_dir "main" $map_1st $logfile
-combine_and_check_maps $run_dir "spliced" $map_2nd $logfile
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createGenefindingInfo.sh b/tools/run_specific_scripts/transcriptome_analysis/createGenefindingInfo.sh
deleted file mode 100755 (executable)
index f7a6b3f..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-for((idx=1;idx<2;idx++))
-do
-   current_dir=/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_${idx}/spliced
-   input=$current_dir/alignment 
-   result=$current_dir/alignment/alignmentInfo.genefinding
-   result2=$current_dir/alignment/alignmentInfo.genefinding.chr1_only
-   result3=$current_dir/alignment/alignmentInfo.genefinding.chr1_only.sorted
-   intron_stops=$current_dir/alignment/intron_stops
-   intron_starts=$current_dir/alignment/intron_starts
-   intron_info=$current_dir/alignment/intron_info
-
-   python createExonInfoForGenefinding.py $input $result
-
-   #cat $result | grep '^1' > $result2
-   #cat $result2 | sort > $result3
-
-   for((chromo=1;chromo<6;chromo++))
-   do
-      for strand in "+" "-"
-      do
-         full_intron_stops=${intron_stops}_chr${chromo}_${strand}
-         full_intron_starts=${intron_starts}_chr${chromo}_${strand}
-         full_intron_info=${intron_info}_chr${chromo}_${strand}
-         cat $result | grep "^$chromo" | grep "        $strand " | cut -f3 | sed -e '/[,]\{1,1\}/!d' | cut -d ',' -f2 > $full_intron_stops
-         cat $result | grep "^$chromo" | grep "        $strand " | cut -f4 | sed -e '/[,]\{1,1\}/!d' | cut -d ',' -f1 > $full_intron_starts
-         paste --delimiters=" " $full_intron_starts $full_intron_stops > $full_intron_info
-         rm $full_intron_stops $full_intron_starts
-      done
-   done
-done
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py b/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py
deleted file mode 100644 (file)
index ffa07ab..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import os.path
-import sys
-import cProfile
-
-from compile_dataset import DatasetGenerator
-
-jp = os.path.join
-
-def combine_spliced_map_parts(data_dir,result_fn):
-   """
-   """
-   
-   assert os.path.exists(data_dir)
-
-   result_fn = jp(data_dir,result_fn)
-
-   if os.path.exists(result_fn):
-      os.remove(result_fn)
-
-   for chunk_fn in os.listdir(data_dir):
-      if chunk_fn.endswith('.spliced'):
-         full_chunk_fn = jp(data_dir,chunk_fn)
-         cmd = 'cat %s >>  %s' % (full_chunk_fn,result_fn)
-         os.system(cmd)
-
-   cmd = "cat %s | sed -e \'/^#/d\' > tmp ; mv tmp %s" % (result_fn,result_fn)
-   os.system(cmd)
-         
-
-def run():
-
-   #main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
-   #spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3'
-   #result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_3/dataset'
-
-   main_dir    = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/main/'
-   spliced_dir = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/'
-   result_dir  = '/fml/ag-raetsch/share/projects/qpalma/thaliana_4_lanes/lane_4/spliced/dataset/'
-
-   combine_spliced_map_parts(main_dir,'map.vm.spliced')
-
-   map_1_fn = jp(main_dir,'map.vm.spliced')
-   map_2_fn = jp(spliced_dir,'map.vm')
-
-   dg = DatasetGenerator(map_1_fn,map_2_fn)
-   dg.compile_dataset()
-   dg.saveAs(jp(result_dir,'dataset_run_1'))
-
-if __name__ == '__main__':
-   #cProfile.run('run()')
-   run()
diff --git a/tools/spliceScoreConverter.py b/tools/spliceScoreConverter.py
new file mode 100644 (file)
index 0000000..ceeaeec
--- /dev/null
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import array
+import os.path
+import sys
+
+mes = 'Usage: python spliceScoreConverter.py infile outfile'
+
+def convert2binary(in_fn,out_fn):
+   """
+   Convert a text file of whitespace-separated '<position> <score>' lines
+   into two binary files: '<out_fn>.pos' (unsigned ints) and '<out_fn>.Conf' (floats).
+   """
+
+   # count number of lines once so the arrays can be pre-sized
+   file_content = open(in_fn).read()
+   size = file_content.count('\n')
+
+   positions   = array.array('I',[0]*size)
+   scores      = array.array('f',[0.0]*size)
+
+   for idx,line in enumerate(open(in_fn)):
+      line = line.strip()
+      # split on arbitrary whitespace, not a single blank character
+      pos,score = line.split()
+      positions[idx] = int(pos)
+      scores[idx]    = float(score)
+
+   positions.tofile(open('%s.pos'%out_fn,'wb'))
+   scores.tofile(open('%s.Conf'%out_fn,'wb'))
+
+if __name__ == '__main__':
+   if len(sys.argv)-1 != 2:
+      # wrong argument count: print usage and abort instead of crashing later
+      sys.exit(mes)
+
+   filename = sys.argv[1]
+   assert os.path.exists(filename), mes
+
+   out_filename = sys.argv[2]
+
+   convert2binary(filename,out_filename)