+ Lookup table checked for pos. strand
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 12:56:36 +0000 (12:56 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 12:56:36 +0000 (12:56 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8677 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Lookup.py

index 0fabc39..a0673ab 100644 (file)
@@ -6,14 +6,21 @@ import random
 import os
 import re
 import pdb
+import random
 
-from Genefinding import *
+from numpy.matlib import inf
 
+from Genefinding import *
 from genome_utils import load_genomic
 
+# only for checking purposes
+from compile_dataset import  get_seq_and_scores
+
+
 class Lookup:
    """
-   Speed up genomic and splice site score queries
+   Speed up genomic and splice site score queries by prefetching the sequences
+   and the scores once for all chromosomes.
    """
 
 
@@ -23,12 +30,17 @@ class Lookup:
       self.max_size = 40000000
 
       self.chromosomes = range(1,6)
+      #self.chromosomes = [1]
 
-      strands = ['+']
+      strands = ['+','-']
 
-      self.genomicSequences   = [0]*(len(self.chromosomes)+1)
-      self.acceptorScores     = [0]*(len(self.chromosomes)+1)
-      self.donorScores        = [0]*(len(self.chromosomes)+1)
+      self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
+      self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
+      self.donorScoresP        = [0]*(len(self.chromosomes)+1)
+
+      self.genomicSequencesN   = [0]*(len(self.chromosomes)+1)
+      self.acceptorScoresN     = [0]*(len(self.chromosomes)+1)
+      self.donorScoresN        = [0]*(len(self.chromosomes)+1)
      
       for chrId in self.chromosomes:
          for strandId in strands:
@@ -39,10 +51,23 @@ class Lookup:
    def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
       assert chr in self.chromosomes
 
-      currentSequence         = self.genomicSequences[chr][start:stop]
-      currentDonors           = self.donorScores[chr][start:stop]
-      currentAcceptorScores   = self.acceptorScores[chr][start:stop]
-      currentAcceptorScores   = currentAcceptorScores[chr][start:stop]
+      if strand == '+'
+         currentSequence         = self.genomicSequencesP[chr][start-1:stop]
+         length = len(currentSequence)
+         currentDonorScores      = self.donorScoresP[chr][start:stop]
+         currentDonorScores      += [-inf]*(length-len(currentDonorScores))
+         currentAcceptorScores   = self.acceptorScoresP[chr][start:stop+2]
+         currentAcceptorScores   = currentAcceptorScoresP[1:]
+      elif:
+         currentSequence         = self.genomicSequencesN[chr][start-1:stop]
+         length = len(currentSequence)
+         currentDonorScores      = self.donorScoresN[chr][start:stop]
+         currentDonorScores      += [-inf]*(length-len(currentDonorScores))
+         currentAcceptorScores   = self.acceptorScoresN[chr][start:stop+2]
+         currentAcceptorScores   = currentAcceptorScoresN[1:]
+
+
+      return currentSequence, currentAcceptorScores, currentDonorScores
 
 
    def prefetch_seq_and_scores(self,chr,strand):
@@ -63,15 +88,23 @@ class Lookup:
       # check the obtained dna sequence
       assert genomicSeq != '', 'load_genomic returned empty sequence!'
 
-      self.genomicSequences[chr] = genomicSeq
-
       intervalBegin  = 0
       intervalEnd    = self.max_size
 
       currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
 
-      self.acceptorScores[chr]   = currentAcc
-      self.donorScores[chr]      = currentDon
+      if strand == '+'
+         self.genomicSequencesP[chr] = genomicSeq
+         self.acceptorScoresP[chr]   = currentAcc
+         self.donorScoresP[chr]      = currentDon
+
+      elif:
+         self.genomicSequencesN[chr] = genomicSeq
+         self.acceptorScoresN[chr]   = currentAcc
+         self.donorScoresN[chr]      = currentDon
+
+      else:
+         assert False
 
 
    def getSpliceScores(self,chr,strand,intervalBegin,intervalEnd):
@@ -101,7 +134,31 @@ class Lookup:
 
       return acc, don
 
+
 if __name__ == '__main__':
    dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    lt1 = Lookup(dna_flat_files)
-   print lt1.get_seq_and_scores(1,'+',1001,2002,'')
+
+   #seq,acc,don =  lt1.get_seq_and_scores(1,'+',1001,2002,'')
+   #_seq,_acc,_don = get_seq_and_scores(1,'+',1001,2002,dna_flat_files)
+
+   for chro in range(1,6):
+      start = random.randint(40,10000)
+      stop  = start + random.randint(10,100) 
+
+      seq,acc,don    =  lt1.get_seq_and_scores(chro,'+',start,stop,'')
+      _seq,_acc,_don = get_seq_and_scores(chro,'+',start,stop,dna_flat_files)
+
+      assert _seq == seq
+      assert _acc == acc
+      assert _don == don
+
+      #seq,acc,don    =  lt1.get_seq_and_scores(chro,'-',start,stop,'')
+      #_seq,_acc,_don = get_seq_and_scores(chro,'-',start,stop,dna_flat_files)
+
+      #assert _seq == seq
+      #assert _acc == acc
+      #assert _don == don
+
+
+   pdb.set_trace()