+ fixed some bugs in the negative strand lookup table
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 5 Jun 2008 15:24:51 +0000 (15:24 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 5 Jun 2008 15:24:51 +0000 (15:24 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9406 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py
scripts/Lookup.py

index 5e00cee..46435e1 100644 (file)
@@ -6,12 +6,22 @@ import pdb
 import random
 import re
 import sys
+import subprocess
 
 from numpy.matlib import inf
 
 from Genefinding import *
 from genome_utils import load_genomic
 
+def get_flatfile_size(filename):
+   cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
+   obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+   out,err = obj.communicate()
+   
+   if err != '':
+      print 'Error occurred while trying to obtain file size'
+   return int(out)
+
 
 def reverse_complement(seq):
    """
@@ -172,14 +182,7 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
    if strand == '-':
       fn = 'chr%d.dna.flat' % chr
       filename = os.path.join(dna_flat_files,fn)
-      cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
-      import subprocess
-      obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      
-      if err != '':
-         print 'Error occurred while trying to obtain file size'
-      end = int(out)
+      end = get_flatfile_size(filename)
 
       intervalBegin = genomicSeq_start-100
       intervalEnd    = genomicSeq_stop+100
index a3ccdd6..fd68fe9 100644 (file)
@@ -9,12 +9,7 @@ import pdb
 import random
 
 from numpy.matlib import inf
-
-from Genefinding import *
-from genome_utils import load_genomic
-
-# only for checking purposes
-from qpalma.sequence_utils import  get_seq_and_scores
+from qpalma.sequence_utils import get_seq_and_scores,get_flatfile_size
 
 
 class Lookup:
@@ -23,14 +18,11 @@ class Lookup:
    and the scores once for all chromosomes.
    """
 
-
    def __init__(self,dna_flat_files):
       self.dna_flat_files = dna_flat_files
 
-      self.max_size = 40000000
-
       self.chromosomes = range(1,6)
-      self.strands = ['-']
+      self.strands = ['+','-']
 
       self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
       self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
@@ -41,30 +33,27 @@ class Lookup:
       self.donorScoresN        = [0]*(len(self.chromosomes)+1)
      
       #for chrId in self.chromosomes:
-      for chrId in range(4,6):
+      for chrId in range(1,6):
          for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
 
  
-   def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
-      assert chr in self.chromosomes
+   def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
+      assert chromo in self.chromosomes
       assert strand in self.strands
 
       if strand == '+':
-         currentSequence         = self.genomicSequencesP[chr][start-1:stop]
-         length = len(currentSequence)
-         currentDonorScores     = self.donorScoresP[chr][start:stop]
-         currentDonorScores     += [-inf]*(length-len(currentDonorScores))
-         currentAcceptorScores  = self.acceptorScoresP[chr][start:stop+2]
-         currentAcceptorScores  = currentAcceptorScores[1:]
+         currentSequence         = self.genomicSequencesP[chromo][start:stop+1]
+         currentDonorScores     = self.donorScoresP[chromo][start:stop+1]
+         currentAcceptorScores  = self.acceptorScoresP[chromo][start:stop+1]
       elif strand == '-':
-         currentSequence         = self.genomicSequencesN[chr][start-1:stop]
-         length = len(currentSequence)
-         currentDonorScores     = self.donorScoresN[chr][start:stop]
-         currentDonorScores     += [-inf]*(length-len(currentDonorScores))
-         currentAcceptorScores  = self.acceptorScoresN[chr][start:stop+2]
-         currentAcceptorScores  = currentAcceptorScores[1:]
+         end = len(self.genomicSequencesN[chromo])
+         currentSequence         = self.genomicSequencesN[chromo][end-stop-1:end-start]
+         #length = len(currentSequence)
+         currentDonorScores     = self.donorScoresN[chromo][end-stop-1:end-start]
+         #currentDonorScores     += [-inf]*(length-len(currentDonorScores))
+         currentAcceptorScores  = self.acceptorScoresN[chromo][end-stop-1:end-start]
       else:
          pdb.set_trace()
 
@@ -79,14 +68,10 @@ class Lookup:
 
       fn = 'chr%d.dna.flat' % chromo
       filename = os.path.join(self.dna_flat_files,fn)
-      cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
-      import subprocess
-      obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-      out,err = obj.communicate()
-      if err != '':
-         print 'Error occurred while trying to obtain file size'
-      size = int(out)
+      size = get_flatfile_size(filename)
 
+      #  we do not use the first/last 205 position as they are can not be
+      #  predicted due to the window size used for splice site prediction
       genomicSeq_start  = 205
       genomicSeq_stop   = size-205
       genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
@@ -99,49 +84,10 @@ class Lookup:
          self.genomicSequencesP[chromo] = U + genomicSeq + U
          self.acceptorScoresP[chromo]   = INF_F + currentAcc + INF_F
          self.donorScoresP[chromo]      = INF_F + currentDon + INF_F
-
       elif strand == '-':
          self.genomicSequencesN[chromo] = U + genomicSeq + U
          self.acceptorScoresN[chromo]   = INF_F + currentAcc + INF_F
          self.donorScoresN[chromo]      = INF_F + currentDon + INF_F
-
-      else:
-         assert False
-
-
-   def _prefetch_seq_and_scores(self,chr,strand):
-      """
-      This function expects an interval, chromosome and strand information and
-      returns then the genomic sequence of this interval and the associated scores.
-      """
-
-      chrom         = 'chr%d' % chr
-
-      genomicSeq_start  = 0
-      genomicSeq_stop   = self.max_size
-      genomicSeq = load_genomic(chrom,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files,one_based=False)
-      genomicSeq = genomicSeq.lower()
-
-      no_base = re.compile('[^acgt]')
-      genomicSeq = no_base.sub('n',genomicSeq)
-      # check the obtained dna sequence
-      assert genomicSeq != '', 'load_genomic returned empty sequence!'
-
-      intervalBegin  = 0
-      intervalEnd    = self.max_size
-
-      currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
-
-      if strand == '+':
-         self.genomicSequencesP[chr] = genomicSeq
-         self.acceptorScoresP[chr]   = currentAcc
-         self.donorScoresP[chr]      = currentDon
-
-      elif strand == '-':
-         self.genomicSequencesN[chr] = genomicSeq
-         self.acceptorScoresN[chr]   = currentAcc
-         self.donorScoresN[chr]      = currentDon
-
       else:
          assert False
 
@@ -177,27 +123,3 @@ class Lookup:
 if __name__ == '__main__':
    dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    lt1 = Lookup(dna_flat_files)
-
-   #seq,acc,don =  lt1.get_seq_and_scores(1,'+',1001,2002,'')
-   #_seq,_acc,_don = get_seq_and_scores(1,'+',1001,2002,dna_flat_files)
-
-   for chro in range(1,6):
-      start = random.randint(40,10000)
-      stop  = start + random.randint(10,100) 
-
-      seq,acc,don    =  lt1.get_seq_and_scores(chro,'-',start,stop,'')
-      _seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
-
-      assert _seq == seq, pdb.set_trace()
-      assert _acc == acc
-      assert _don == don
-
-      #seq,acc,don    =  lt1.get_seq_and_scores(chro,'-',start,stop,'')
-      #_seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
-
-      #assert _seq == seq
-      #assert _acc == acc
-      #assert _don == don
-
-   pdb.set_trace()
-