+ fixed some issues with the splice site scores and ugly code fragments
authorFabio <fabio@congo.fml.local>
Mon, 13 Oct 2008 08:46:18 +0000 (10:46 +0200)
committerFabio <fabio@congo.fml.local>
Mon, 13 Oct 2008 08:46:18 +0000 (10:46 +0200)
MANIFEST.in
doc/qpalma-manual.tex
qpalma/Lookup.py
qpalma/SettingsParser.py
qpalma/gridtools.py
qpalma/qpalma_main.py
qpalma/sequence_utils.py
test.conf
tests/test_qpalma.py
tests/test_qpalma_installation.py [new file with mode: 0644]

index d6c12e3..d3efe32 100644 (file)
@@ -1,5 +1,8 @@
 recursive-include ParaParser *.cpp *.h *.i Makefile
 recursive-include DynProg *.cpp *.h *.i Makefile
 recursive-include doc *.tex Makefile
 recursive-include ParaParser *.cpp *.h *.i Makefile
 recursive-include DynProg *.cpp *.h *.i Makefile
 recursive-include doc *.tex Makefile
+recursive-include scripts *.py
+recursive-include qpalma *.py
+recursive-include tests *.py
 include test.conf
 
 include test.conf
 
index 78f011e..908dead 100644 (file)
@@ -62,7 +62,7 @@ For training \QP you need one of the following optimization toolkits:
 
 \begin{enumerate}
 \item Install the Pythongrid and the Genefinding tool packages
 
 \begin{enumerate}
 \item Install the Pythongrid and the Genefinding tool packages
-\item Update your PYTHONPATH variable to poin to the above packages
+\item Update your PYTHONPATH variable to point to the above packages
 \item Unpack the QPalma tarball via
 \item[$\rightarrow$] tar -xzvf QPalma-1.0.tar.gz
 \item Enter the QPalma-1.0 directory and type:
 \item Unpack the QPalma tarball via
 \item[$\rightarrow$] tar -xzvf QPalma-1.0.tar.gz
 \item Enter the QPalma-1.0 directory and type:
index da9365c..bc36f96 100644 (file)
@@ -8,8 +8,10 @@
 
 import sys
 import os
 
 import sys
 import os
+import pdb
+from numpy import inf
 
 
-from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper,reverse_complement
 
 
 class LookupTable:
 
 
 class LookupTable:
@@ -23,8 +25,6 @@ class LookupTable:
       accessWrapper = DataAccessWrapper(settings)
       self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
       accessWrapper = DataAccessWrapper(settings)
       self.seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
 
-      self.strands = ['+','-']
-   
       # take care that it also works say if we get a chromo_list [1,3,5]
       # those are mapped then to 0,1,2
       self.chromo_list = chromo_list
       # take care that it also works say if we get a chromo_list [1,3,5]
       # those are mapped then to 0,1,2
       self.chromo_list = chromo_list
@@ -38,71 +38,81 @@ class LookupTable:
       self.acceptorScores     = {}
       self.donorScores        = {}
 
       self.acceptorScores     = {}
       self.donorScores        = {}
 
-      for strandId in self.strands:
-         self.genomicSequences[strandId] = [0]*(len(self.chromo_list))
-         self.acceptorScores[strandId]   = [0]*(len(self.chromo_list))
-         self.donorScores[strandId]      = [0]*(len(self.chromo_list))
+      self.genomicSequences = [0]*(len(self.chromo_list))
+      self.acceptorScoresPos   = [0]*(len(self.chromo_list))
+      self.donorScoresPos      = [0]*(len(self.chromo_list))
+      self.acceptorScoresNeg   = [0]*(len(self.chromo_list))
+      self.donorScoresNeg      = [0]*(len(self.chromo_list))
 
       for chrId in self.chromo_list:
 
       for chrId in self.chromo_list:
-         for strandId in self.strands:
-            print (chrId,strandId)
-            self.prefetch_seq_and_scores(chrId,strandId)
+         self.prefetch_seq_and_scores(chrId)
 
  
 
  
-   def get_seq_and_scores(self,chromo,strand,start,stop):
+   def get_seq_and_scores(self,chromo,strand,_start,_stop):
       assert chromo in self.chromo_list
       assert chromo in self.chromo_list
-      assert strand in self.strands
+
+      if strand == '-':
+         total_size = self.seqInfo.getFragmentSize(chromo)
+         startP = total_size - _stop
+         stopP  = total_size - _start
+         stop = stopP
+         start = startP
 
       assert start <= stop
 
       chromo_idx = self.chromo_map[chromo]
 
 
       assert start <= stop
 
       chromo_idx = self.chromo_map[chromo]
 
-      start -= 1
-      stop  -= 1
-
-      currentSequence         = self.genomicSequences[strand][chromo_idx][start:stop]
-
-      if strand == '-':
-         start += 2
-         stop  += 2
-
-      currentAcceptorScores   = self.acceptorScores[strand][chromo_idx][start:stop]
-      currentDonorScores      = self.donorScores[strand][chromo_idx][start:stop]
+      currentSequence         = self.genomicSequences[chromo_idx][start:stop]
+      if strand == '+':
+         currentAcceptorScores   = self.acceptorScoresPos[chromo_idx][start:stop]
+         currentDonorScores      = self.donorScoresPos[chromo_idx][start:stop]
+      elif strand == '-':
+         currentAcceptorScores   = self.acceptorScoresNeg[chromo_idx][_start:_stop]
+         currentDonorScores      = self.donorScoresNeg[chromo_idx][_start:_stop]
 
       if strand == '-':
 
       if strand == '-':
-         currentSequence = currentSequence[::-1]
-         currentAcceptorScores.reverse()
-         currentDonorScores.reverse()
+         currentSequence = reverse_complement(currentSequence)
 
       return currentSequence, currentAcceptorScores, currentDonorScores
 
 
 
       return currentSequence, currentAcceptorScores, currentDonorScores
 
 
-   def prefetch_seq_and_scores(self,chromo,strand):
+   def prefetch_seq_and_scores(self,chromo):
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
       """
       assert chromo in self.chromo_list
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
       """
       assert chromo in self.chromo_list
-      assert strand in self.strands
 
 
-      chromo_idx = self.chromo_map[chromo]
+      genomicSeq_start  = 0
+      genomicSeq_stop   = self.seqInfo.getFragmentSize(chromo)
 
 
-      genomicSeq_start  = 1
-      genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]-1
+      print 'lt total_size %d' % self.seqInfo.getFragmentSize(chromo)
 
 
-      print 'lt total_size %d' % self.seqInfo.chromo_sizes[chromo]
+      strand = '+'
+      genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
+      print len(currentAcc),len(currentDon)
+      genomicSeq = genomicSeq.lower()
 
 
-      if strand == '-':
-         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]+1
+      chromo_idx = self.chromo_map[chromo]
+
+      self.genomicSequences[chromo_idx] = genomicSeq
+      self.acceptorScoresPos[chromo_idx]   = currentAcc
+      self.donorScoresPos[chromo_idx]      = currentDon
 
 
+
+      strand = '-'
       genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
       genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,False,False)
+      print len(currentAcc),len(currentDon)
       genomicSeq = genomicSeq.lower()
 
       genomicSeq = genomicSeq.lower()
 
-      if strand == '-':
-         genomicSeq = genomicSeq[::-1]
-         currentAcc.reverse()
-         currentDon.reverse()
+      newCurrentAcc = [-inf] 
+      newCurrentAcc.extend(currentAcc[:-1])
+      currentAcc = newCurrentAcc
+      newCurrentDon = [-inf] 
+      newCurrentDon.extend(currentDon[:-1])
+      currentDon = newCurrentDon
+      print len(currentAcc),len(currentDon)
+
+      self.acceptorScoresNeg[chromo_idx]   = currentAcc
+      self.donorScoresNeg[chromo_idx]      = currentDon
 
 
-      self.genomicSequences[strand][chromo_idx] = genomicSeq
-      self.acceptorScores[strand][chromo_idx]   = currentAcc
-      self.donorScores[strand][chromo_idx]      = currentDon
index 83b0858..b7a88b8 100644 (file)
@@ -51,6 +51,9 @@ def makeSettings(settings):
    settings['training_dir'] = jp(result_dir, 'training')
    settings['alignment_dir'] = jp(result_dir, 'alignment')
 
    settings['training_dir'] = jp(result_dir, 'training')
    settings['alignment_dir'] = jp(result_dir, 'alignment')
 
+   # the global logfile will be created in the result dir
+   settings['global_log_fn'] = jp(result_dir, 'qpalma.log')
+
    for dir_name in ['approximation_dir','dataset_dir', 'preproc_dir', 'postproc_dir',\
      'prediction_dir', 'training_dir', 'alignment_dir']:
       try:
    for dir_name in ['approximation_dir','dataset_dir', 'preproc_dir', 'postproc_dir',\
      'prediction_dir', 'training_dir', 'alignment_dir']:
       try:
index 54aae6d..3df842c 100644 (file)
@@ -275,7 +275,7 @@ class PostprocessingTask(ClusterTask):
          self.result_files.append(result_fn)
 
          current_job = KybJob(gridtools.PostProcessingTaskStarter,[self.settings,chunk_fn,result_fn])
          self.result_files.append(result_fn)
 
          current_job = KybJob(gridtools.PostProcessingTaskStarter,[self.settings,chunk_fn,result_fn])
-         current_job.h_vmem = '15.0G'
+         current_job.h_vmem = '2.0G'
          current_job.express = 'True'
 
          print "job #1: ", current_job.nativeSpecification
          current_job.express = 'True'
 
          print "job #1: ", current_job.nativeSpecification
index f4b8c89..a28e910 100644 (file)
@@ -493,17 +493,17 @@ class QPalma:
       # we do not need the full dataset anymore
       del dataset
    
       # we do not need the full dataset anymore
       del dataset
    
-      # Load parameter vector to predict with
-      param = cPickle.load(open(settings['prediction_param_fn']))
-
-      self.predict(prediction_set,param,settings)
+      self.predict(prediction_set,settings)
 
 
 
 
-   def predict(self,prediction_set,param,settings):
+   def predict(self,prediction_set,settings):
       """
       This method...
       """
 
       """
       This method...
       """
 
+      # Load parameter vector to predict with
+      param = cPickle.load(open(settings['prediction_param_fn']))
+
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
       set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
       # Set the parameters such as limits/penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] =\
       set_param_palma(param,self.ARGS.train_with_intronlengthinformation,settings)
index e111f3c..626ecc5 100644 (file)
@@ -83,7 +83,8 @@ def get_flat_file_size(filename):
       print 'Error occurred while trying to obtain file size'
       raise DataAccessionException
 
       print 'Error occurred while trying to obtain file size'
       raise DataAccessionException
 
-   return int(out)
+   # subtract one because of wc -c output
+   return int(out)-1
 
 
 def reverse_complement(seq):
 
 
 def reverse_complement(seq):
@@ -244,13 +245,8 @@ class DataAccessWrapper:
       self.sscore_fmt         = settings['splice_score_file_fmt'] 
 
 
       self.sscore_fmt         = settings['splice_score_file_fmt'] 
 
 
-   def get_genomic_fragment_fn(self,id,strand):
-      if strand == '+':
-         return os.path.join(self.genomic_dir,self.genomic_fmt%id)
-      elif strand == '-':
-         return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
-      else:
-         assert False
+   def get_genomic_fragment_fn(self,id):
+      return os.path.join(self.genomic_dir,self.genomic_fmt%id)
 
 
    def get_splice_site_scores_fn(self,id,strand):
 
 
    def get_splice_site_scores_fn(self,id,strand):
@@ -265,21 +261,32 @@ class SeqSpliceInfo():
    
    """
    
    
    """
    
-   def __init__(self,accessWrapper,id_list):
+   def __init__(self,accessWrapper,fragment_list):
       self.accessWrapper = accessWrapper
       
       self.chromo_sizes = {}
       self.accessWrapper = accessWrapper
       
       self.chromo_sizes = {}
+
+      # take care that it also works say if we get a chromo_list [1,3,5]
+      # those are mapped then to 0,1,2
+      self.fragment_list = fragment_list
+      self.chromo_map = {}
+      for idx,elem in enumerate(fragment_list):
+         self.chromo_map[elem] = idx
    
       print "Fetching fragment sizes..."
    
       print "Fetching fragment sizes..."
-      for chromo in id_list:
-         filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
+      for id in fragment_list:
+         filename = self.accessWrapper.get_genomic_fragment_fn(id)
          total_size = get_flat_file_size(filename)
          total_size = get_flat_file_size(filename)
-         self.chromo_sizes[chromo] = total_size
+         self.chromo_sizes[self.chromo_map[id]] = total_size
 
       print "done"
 
 
 
       print "done"
 
 
-   def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
+   def getFragmentSize(self,id):
+      return self.chromo_sizes[self.chromo_map[id]]
+
+
+   def getSpliceScores(self,id,strand,intervalBegin,intervalEnd,genomicSeq=''):
       """
       Now we want to use interval_query to get the predicted splice scores trained
       on the TAIR sequence and annotation.
       """
       Now we want to use interval_query to get the predicted splice scores trained
       on the TAIR sequence and annotation.
@@ -295,24 +302,21 @@ class SeqSpliceInfo():
       pos_size = new_intp()
       intp_assign(pos_size,1)
 
       pos_size = new_intp()
       intp_assign(pos_size,1)
 
-      total_size = self.chromo_sizes[chromo]
+      total_size = self.getFragmentSize(id)
 
 
-      acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
+      acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(id,strand)
 
       # fetch acceptor scores
 
       # fetch acceptor scores
-      acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
+      acc = doIntervalQuery(id,strand,intervalBegin,intervalEnd,acc_fn,total_size)
 
       # fetch donor scores
 
       # fetch donor scores
-      don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
+      don = doIntervalQuery(id,strand,intervalBegin,intervalEnd,don_fn,total_size)
 
       return acc, don
 
 
    def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
 
       return acc, don
 
 
    def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
-      #chromo_string  = 'chr%d' % chromo
-      #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
-      filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
-      #print filename,genomicSeq_start,genomicSeq_stop
+      filename = self.accessWrapper.get_genomic_fragment_fn(chromo)
       genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
       genomicSeq = genomicSeq.strip().lower()
 
       genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
       genomicSeq = genomicSeq.strip().lower()
 
@@ -326,7 +330,7 @@ class SeqSpliceInfo():
       return genomicSeq
 
 
       return genomicSeq
 
 
-   def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
+   def get_seq_and_scores(self,id,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True):
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
       """
       This function expects an interval, chromosome and strand information and
       returns then the genomic sequence of this interval and the associated scores.
@@ -336,26 +340,22 @@ class SeqSpliceInfo():
       # do not have any score predictions
       NO_SCORE_WINDOW_SIZE = 205
 
       # do not have any score predictions
       NO_SCORE_WINDOW_SIZE = 205
 
+      chromo = self.chromo_map[id]
+
       total_size = self.chromo_sizes[chromo]
 
       total_size = self.chromo_sizes[chromo]
 
-      #print genomicSeq_start,genomicSeq_stop
+      if strand == '-':
+         s_start  = total_size - genomicSeq_stop
+         s_stop   = total_size - genomicSeq_start
+         genomicSeq_start = s_start
+         genomicSeq_stop  = s_stop
 
       assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
 
 
       assert genomicSeq_start < genomicSeq_stop, pdb.set_trace()
 
-      if strand == '+':
-         s_start  = genomicSeq_start - 1
-         s_stop   = genomicSeq_stop - 1 
-
-         genomicSeq_start  -= 1
-         genomicSeq_stop   -= 1 
+      genomicSeq = self.fetch_sequence(id,strand,genomicSeq_start,genomicSeq_stop)
 
       if strand == '-':
 
       if strand == '-':
-         s_start  = total_size - genomicSeq_stop + 1
-         s_stop   = total_size - genomicSeq_start + 1 
-
-      assert genomicSeq_start < genomicSeq_stop
-
-      genomicSeq = self.fetch_sequence(chromo,strand,s_start,s_stop)
+         genomicSeq = reverse_complement(genomicSeq)
 
       if only_seq:
          return genomicSeq
 
       if only_seq:
          return genomicSeq
@@ -371,7 +371,7 @@ class SeqSpliceInfo():
       if intervalEnd == total_size-1:
          lookup_offset_end = 0
 
       if intervalEnd == total_size-1:
          lookup_offset_end = 0
 
-      currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,genomicSeq)
+      currentAcc, currentDon = self.getSpliceScores(id,strand,intervalBegin,intervalEnd,genomicSeq)
 
       if strand == '-':
          currentAcc = currentAcc[1:]
 
       if strand == '-':
          currentAcc = currentAcc[1:]
@@ -414,13 +414,13 @@ class SeqSpliceInfo():
       don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
 
       if perform_checks and not ag_tuple_pos == acc_pos:
       don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
 
       if perform_checks and not ag_tuple_pos == acc_pos:
-         print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
+         print 'ACC: Chromo/strand: %d/%s' % (id,strand)
          print ag_tuple_pos[:10],ag_tuple_pos[-10:]
          print acc_pos[:10],acc_pos[-10:]
          pdb.set_trace()
 
       if perform_checks and not gt_tuple_pos == don_pos:
          print ag_tuple_pos[:10],ag_tuple_pos[-10:]
          print acc_pos[:10],acc_pos[-10:]
          pdb.set_trace()
 
       if perform_checks and not gt_tuple_pos == don_pos:
-         print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
+         print 'DON: Chromo/strand: %d/%s' % (id,strand)
          print gt_tuple_pos[:10],gt_tuple_pos[-10:]
          print don_pos[:10],don_pos[-10:]
          pdb.set_trace()
          print gt_tuple_pos[:10],gt_tuple_pos[-10:]
          print don_pos[:10],don_pos[-10:]
          pdb.set_trace()
index 4c8a65b..39e2b0c 100644 (file)
--- a/test.conf
+++ b/test.conf
@@ -38,12 +38,6 @@ unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
 
 num_splits = 5
 
 
 num_splits = 5
 
-#
-# Specifiy here the filename of the global logfile QPalma uses to report everything
-#
-
-global_log_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/first_test/qpalma.log
-
 #
 # The parameter you want to do prediction with:
 #
 #
 # The parameter you want to do prediction with:
 #
index 350e5a7..5f62f40 100644 (file)
@@ -24,14 +24,17 @@ from qpalma.utils import print_prediction
 
 from qpalma.Run import Run
 
 
 from qpalma.Run import Run
 
+from qpalma.SettingsParser import parseSettings
 from qpalma.OutputFormat import alignment_reconstruct
 from qpalma.OutputFormat import alignment_reconstruct
-
 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
 
 from qpalma.sequence_utils import DataAccessWrapper,SeqSpliceInfo,reverse_complement
 
+from qpalma.Lookup import LookupTable
+
+
 jp = os.path.join
 
 
 jp = os.path.join
 
 
-def createScoresForSequence(full_chromo_seq):
+def createScoresForSequence(full_chromo_seq,reverse_strand=False):
    """
    Given a genomic sequence this function calculates random scores for all
    ocurring splice sites.
    """
    Given a genomic sequence this function calculates random scores for all
    ocurring splice sites.
@@ -40,6 +43,8 @@ def createScoresForSequence(full_chromo_seq):
    acc_pos = []
    don_pos = []
 
    acc_pos = []
    don_pos = []
 
+   total_size = len(full_chromo_seq)
+
    # the first/last 205 pos do not have any predictions
    for pos,elem in enumerate(full_chromo_seq[205:-205]):
       if full_chromo_seq[pos-2:pos] == 'ag':
    # the first/last 205 pos do not have any predictions
    for pos,elem in enumerate(full_chromo_seq[205:-205]):
       if full_chromo_seq[pos-2:pos] == 'ag':
@@ -60,6 +65,17 @@ def createScoresForSequence(full_chromo_seq):
    for idx in range(len(don_pos)):
       don_scores[idx] = random.uniform(0.1,1.0)
 
    for idx in range(len(don_pos)):
       don_scores[idx] = random.uniform(0.1,1.0)
 
+   # recalculate indices and reverse them in order to have positions relative
+   # to positive strand
+   if reverse_strand:
+      acc_pos = [total_size-1-e for e in acc_pos]
+      acc_pos.reverse()
+      acc_scores.reverse()
+
+      don_pos = [total_size-1-e for e in don_pos]
+      don_pos.reverse()
+      don_scores.reverse()
+
    acc_pos = array.array('I',acc_pos)
    acc_scores = array.array('f',acc_scores)
 
    acc_pos = array.array('I',acc_pos)
    acc_scores = array.array('f',acc_scores)
 
@@ -85,7 +101,6 @@ def saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,strand):
    don_pos.tofile(open(don_pos_fn, 'wb'))
 
 
    don_pos.tofile(open(don_pos_fn, 'wb'))
 
 
-
 def create_mini_chromosome():
    
    chromo_fn   = 'test_data/chromo1.flat'
 def create_mini_chromosome():
    
    chromo_fn   = 'test_data/chromo1.flat'
@@ -94,23 +109,47 @@ def create_mini_chromosome():
    full_chromo_seq = chromo_fh.read()
    full_chromo_seq = full_chromo_seq.strip()
 
    full_chromo_seq = chromo_fh.read()
    full_chromo_seq = full_chromo_seq.strip()
 
+   print full_chromo_seq[:200]
+
    # create data for forward strand
    # create data for forward strand
-   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq)
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq,reverse_strand=False)
+   print acc_pos[:5]
+   print don_pos[:5]
    saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
 
    # create data for reverse strand
    full_chromo_seq_rev = reverse_complement(full_chromo_seq)
    saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'+')
 
    # create data for reverse strand
    full_chromo_seq_rev = reverse_complement(full_chromo_seq)
-   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev)
 
 
-   # positions are always stored relative to positive strand
-   #acc_scores.reverse()
-   #acc_scores.
+   total_size = len(full_chromo_seq_rev)
+
+   print full_chromo_seq_rev[:200]
+   acc_pos,acc_scores,don_pos,don_scores = createScoresForSequence(full_chromo_seq_rev,reverse_strand=True)
+   acc_pos = [total_size-1-e for e in acc_pos]
+   acc_pos.reverse()
+   print acc_pos[:5]
+   don_pos = [total_size-1-e for e in don_pos]
+   don_pos.reverse()
+   print don_pos[:5]
 
 
-   don_scores.reverse()
+   #
+   # Remember: The positions are always saved one-based.
+   #
 
 
-   saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
+   #saveSpliceInfo(acc_pos,acc_scores,don_pos,don_scores,'-')
 
 
+   a = 103
+   b = 255
+   print 'pos: %s'%full_chromo_seq[a:b]
+   print 'neg: %s'%full_chromo_seq_rev[a:b]
 
 
+   total_size = len(full_chromo_seq)
+   ap = total_size-b
+   bp = total_size-a
+   print 'png: %s'%reverse_complement(full_chromo_seq[ap:bp])
+
+
+def test_rev_comp():
+   get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False,perform_checks=True)
 
 class TestQPalmaPrediction(unittest.TestCase):
    """
 
 class TestQPalmaPrediction(unittest.TestCase):
    """
@@ -214,8 +253,47 @@ class TestQPalmaPrediction(unittest.TestCase):
       example = (currentSeqInfo,read,currentQualities)
       self.prediction_set[id] = [example]
 
       example = (currentSeqInfo,read,currentQualities)
       self.prediction_set[id] = [example]
 
-   
+
    def testAlignments(self):
    def testAlignments(self):
+
+      settings = parseSettings('testcase.conf')
+
+      print self.prediction_set
+      for example_key in self.prediction_set.keys():
+         print 'Current example %d' % example_key
+
+         for example in self.prediction_set[example_key]:
+            print example
+            print 'size'
+            print len(example)
+
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+      qp = QPalma(seqInfo,True)
+      allPredictions = qp.predict(self.prediction_set,settings)
+
+      for current_prediction in allPredictions:
+         align_str = print_prediction(current_prediction)
+         print align_str
+
+         id = current_prediction['id']
+         seq         = current_prediction['read']
+         dna         = current_prediction['dna']
+         chromo      = current_prediction['chr']
+         strand      = current_prediction['strand']
+         start_pos   = current_prediction['start_pos']
+         predExons   = current_prediction['predExons']
+
+         numExons = int(math.ceil(len(predExons) / 2))
+         
+         print alignment_reconstruct(current_prediction,numExons)
+         print id,start_pos,predExons
+
+      print 'Problem counter is %d' % qp.problem_ctr 
+
+
+   def _testAlignments(self):
       run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
 
       run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
       run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
 
       run   = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
@@ -273,7 +351,73 @@ class TestQPalmaPrediction(unittest.TestCase):
       print 'Problem counter is %d' % qp.problem_ctr 
 
 
       print 'Problem counter is %d' % qp.problem_ctr 
 
 
+def simple_check():
+   # fetch the data needed
+   settings = {}
+
+   #settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+   #settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   #settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+   #settings['genome_file_fmt']      = 'chr%d.dna.flat'
+   #settings['splice_score_file_fmt']= 'contig_%d%s'
+
+   #allowed_fragments = [1]
+   #settings['allowed_fragments'] = allowed_fragments
+
+   settings['genome_dir']           = 'test_data/'
+   settings['acceptor_scores_loc']  = 'test_data/acc'
+   settings['donor_scores_loc']     = 'test_data/don'
+
+   settings['genome_file_fmt']      = 'chromo%d.flat'
+   settings['splice_score_file_fmt']= 'chromo_%d%s'
+
+   allowed_fragments = [1]
+   settings['allowed_fragments'] = allowed_fragments
+
+   accessWrapper = DataAccessWrapper(settings)
+   seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
+
+   b = 0
+   e = 10000
+   seq = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=True,perform_checks=False)
+
+   total_size = seqInfo.getFragmentSize(1)
+   bp = total_size - e
+   ep = total_size - b
+   seqp = seqInfo.get_seq_and_scores(1,'+',b,e,only_seq=True,perform_checks=False)
+   seqp = reverse_complement(seqp)
+
+   print seq[0:100] == seqp[0:100]
+   print seq[534:1652] == seqp[534:1652]
+   print seq[-100:] == seqp[-100:]
+
+   lt1 = LookupTable(settings)
+   dna,acc,don= lt1.get_seq_and_scores(1,'-',b,e)
+
+   print seq[:100]  == seqp[:100]  == dna[:100]
+   print seq[-100:] == seqp[-100:] == dna[-100:]
+
+   b = 206
+   e = 300
+   dna,acc,don = seqInfo.get_seq_and_scores(1,'-',b,e,only_seq=False,perform_checks=False)
+   
+   lt1 = LookupTable(settings)
+   _dna,_acc,_don= lt1.get_seq_and_scores(1,'-',b,e)
+
+
+   print dna == _dna
+   print acc == _acc
+   print don == _don
+
+   from numpy import inf
+   print [p for p,e in enumerate(acc) if e != -inf]
+   print [p for p,e in enumerate(_acc) if e != -inf]
+
+
+
 if __name__ == '__main__':
 if __name__ == '__main__':
-   create_mini_chromosome()
+   #create_mini_chromosome()
+   simple_check()
    #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    #unittest.TextTestRunner(verbosity=2).run(suite)
    #suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    #unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/tests/test_qpalma_installation.py b/tests/test_qpalma_installation.py
new file mode 100644 (file)
index 0000000..6a3af01
--- /dev/null
@@ -0,0 +1,92 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+#
+# This file performs a lot of checks in order to verify the installation of
+# QPalma
+#
+
+import os
+import sys
+import unittest
+
+from test_approximation import TestApproximation
+from test_qpalma import TestQPalmaPrediction
+from test_utils import TestSequenceUtils,TestLookupTable
+
+SUCCESS = True
+
+def check_for_module(module_name):
+
+   try:
+      exec("import %s"%module_name)
+   except:
+      return False
+      sys.exc_info()
+
+   return True
+
+
+if __name__ == '__main__':
+   log_fn = 'error.log'
+
+   if os.path.exists(log_fn):
+      os.remove(log_fn)
+
+   out_fh = open(log_fn,'w+')
+
+   print '\nChecking if your installation was successful...'
+
+   print '\nChecking for needed modules...\n'
+
+
+   module_list = ['numpy','DRMAA','Genefinding','pythongrid','QPalmaDP','qpalma']
+
+   for mod in module_list:
+      status = check_for_module(mod)
+      if status:
+         mes = "Successful!"
+      else:
+         mes = "Failed!"
+         SUCCESS = False
+
+      line = 'Status of module %s:\t%s'%(mod,mes)
+      print line
+      out_fh.write(line+'\n')
+
+   print '\nChecking for optional modules...\n'
+
+   module_list = ['pycplex', 'cvxopt', 'pymosek']
+
+   for mod in module_list:
+      line = 'Status of module %s:\t%s'%(mod,str(check_for_module(mod)))
+      print line
+      out_fh.write(line+'\n')
+
+   # after checking the modules we run some simple testcases on QPalma.
+   data_suite = unittest.TestLoader().loadTestsFromTestCase(TestLookupTable)
+   approximation_suite = unittest.TestLoader().loadTestsFromTestCase(TestApproximation)
+   prediction_suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
+
+   all_suites = unittest.TestSuite([data_suite, approximation_suite, prediction_suite])
+   unittest.TextTestRunner(verbosity=2).run(all_suites)
+
+   if SUCCESS:
+      print '\n\n--- All checks where successful!! ---\n\n'
+   else:
+      print '\n\n--- Send the file error.log to qpalma@tuebingen.mpg.de ---\n\n'
+      env = os.environ
+      lines = ''
+      for var in ['PATH','LD_LIBRARY_PATH','PYTHONPATH']:
+         lines += '%s=%s\n'%(var,env[var])
+      out_fh.write(lines)
+
+   out_fh.close()