+ adapted Lookup table to new data interface
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 29 Jul 2008 15:20:38 +0000 (15:20 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 29 Jul 2008 15:20:38 +0000 (15:20 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10219 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Lookup.py

index f1bc39a..4c29143 100644 (file)
@@ -9,48 +9,58 @@ import pdb
 import random
 
 from numpy.matlib import inf
-from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
 
 
-class Lookup:
+class LookupTable:
    """
    Speed up genomic and splice site score queries by prefetching the sequences
    and the scores once for all chromosomes.
    """
 
-   def __init__(self,dna_flat_files):
-      self.dna_flat_files = dna_flat_files
-
-      self.chromosomes = range(1,6)
-      self.strands = ['+','-']
-
-      flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      chromo_list = [1,2,3,4,5,8,9,10,11,12]
+   def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
+      accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
       self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
 
-      self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
-      self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
-      self.donorScoresP        = [0]*(len(self.chromosomes)+1)
-
-      self.genomicSequencesN   = [0]*(len(self.chromosomes)+1)
-      self.acceptorScoresN     = [0]*(len(self.chromosomes)+1)
-      self.donorScoresN        = [0]*(len(self.chromosomes)+1)
+      self.strands = ['+','-']
+   
+      # take care that it also works say if we get a chromo_list [1,3,5]
+      # those are mapped then to 0,1,2
+      self.chromo_list = chromo_list
+      self.chromo_map = []
+      for elem in chromo_list:
+         self.chromo_map.append(elem)
+
+      self.genomicSequencesP   = [0]*(len(self.chromo_map)+1)
+      self.acceptorScoresP     = [0]*(len(self.chromo_map)+1)
+      self.donorScoresP        = [0]*(len(self.chromo_map)+1)
+
+      self.genomicSequencesN   = [0]*(len(self.chromo_map)+1)
+      self.acceptorScoresN     = [0]*(len(self.chromo_map)+1)
+      self.donorScoresN        = [0]*(len(self.chromo_map)+1)
      
-      #for chrId in self.chromosomes:
-      #for chrId in range(1,6):
-      for chrId in range(1,2):
+      for chrId in self.chromo_list:
          for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
 
  
    def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
-      assert chromo in self.chromosomes
+      assert chromo in self.chromo_list
       assert strand in self.strands
 
-      currentSequence         = self.genomicSequencesP[chromo][start:stop]
-      currentDonorScores      = self.donorScoresP[chromo][start:stop]
-      currentAcceptorScores   = self.acceptorScoresP[chromo][start:stop]
+      chromo_idx = self.chromo_map[chromo]
+
+      if strand == '+':
+         currentSequence         = self.genomicSequencesP[chromo_idx][start:stop]
+         currentDonorScores      = self.donorScoresP[chromo_idx][start:stop]
+         currentAcceptorScores   = self.acceptorScoresP[chromo_idx][start:stop]
+      elif strand == '-':
+         currentSequence         = self.genomicSequencesN[chromo_idx][start:stop]
+         currentDonorScores      = self.donorScoresN[chromo_idx][start:stop]
+         currentAcceptorScores   = self.acceptorScoresN[chromo_idx][start:stop]
+      else:
+         assert False
 
       return currentSequence, currentAcceptorScores, currentDonorScores
 
@@ -61,30 +71,20 @@ class Lookup:
       returns then the genomic sequence of this interval and the associated scores.
       """
 
-      genomicSeq_start  = 0
+      chromo_idx = self.chromo_map[chromo]
 
-      if strand == '-':
-         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo+7]
-      else:
-         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]
+      genomicSeq_start  = 0
 
       genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
       genomicSeq = genomicSeq.lower()
 
       if strand == '+':
-         self.genomicSequencesP[chromo] = genomicSeq
-         self.acceptorScoresP[chromo]   = currentAcc
-         self.donorScoresP[chromo]      = currentDon
+         self.genomicSequencesP[chromo_idx] = genomicSeq
+         self.acceptorScoresP[chromo_idx]   = currentAcc
+         self.donorScoresP[chromo_idx]      = currentDon
       elif strand == '-':
-         self.genomicSequencesN[chromo] = genomicSeq
-         self.acceptorScoresN[chromo]   = currentAcc
-         self.donorScoresN[chromo]      = currentDon
+         self.genomicSequencesN[chromo_idx] = genomicSeq
+         self.acceptorScoresN[chromo_idx]   = currentAcc
+         self.donorScoresN[chromo_idx]      = currentDon
       else:
          assert False
-
-
-if __name__ == '__main__':
-   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-   lt1 = Lookup(dna_flat_files)
-
-   #self.lt1.seqInfo.get_seq_and_scores