+ adapted Lookup table to new data interface
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 29 Jul 2008 15:20:38 +0000 (15:20 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 29 Jul 2008 15:20:38 +0000 (15:20 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10219 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Lookup.py

index f1bc39a..4c29143 100644 (file)
@@ -9,48 +9,58 @@ import pdb
 import random
 
 from numpy.matlib import inf
 import random
 
 from numpy.matlib import inf
-from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size
+from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
 
 
 
 
-class Lookup:
+class LookupTable:
    """
    Speed up genomic and splice site score queries by prefetching the sequences
    and the scores once for all chromosomes.
    """
 
    """
    Speed up genomic and splice site score queries by prefetching the sequences
    and the scores once for all chromosomes.
    """
 
-   def __init__(self,dna_flat_files):
-      self.dna_flat_files = dna_flat_files
-
-      self.chromosomes = range(1,6)
-      self.strands = ['+','-']
-
-      flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      chromo_list = [1,2,3,4,5,8,9,10,11,12]
+   def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
+      accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
       self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
 
       self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
 
-      self.genomicSequencesP   = [0]*(len(self.chromosomes)+1)
-      self.acceptorScoresP     = [0]*(len(self.chromosomes)+1)
-      self.donorScoresP        = [0]*(len(self.chromosomes)+1)
-
-      self.genomicSequencesN   = [0]*(len(self.chromosomes)+1)
-      self.acceptorScoresN     = [0]*(len(self.chromosomes)+1)
-      self.donorScoresN        = [0]*(len(self.chromosomes)+1)
+      self.strands = ['+','-']
+   
+      # take care that it also works say if we get a chromo_list [1,3,5]
+      # those are mapped then to 0,1,2
+      self.chromo_list = chromo_list
+      self.chromo_map = []
+      for elem in chromo_list:
+         self.chromo_map.append(elem)
+
+      self.genomicSequencesP   = [0]*(len(self.chromo_map)+1)
+      self.acceptorScoresP     = [0]*(len(self.chromo_map)+1)
+      self.donorScoresP        = [0]*(len(self.chromo_map)+1)
+
+      self.genomicSequencesN   = [0]*(len(self.chromo_map)+1)
+      self.acceptorScoresN     = [0]*(len(self.chromo_map)+1)
+      self.donorScoresN        = [0]*(len(self.chromo_map)+1)
      
      
-      #for chrId in self.chromosomes:
-      #for chrId in range(1,6):
-      for chrId in range(1,2):
+      for chrId in self.chromo_list:
          for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
 
  
    def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
          for strandId in self.strands:
             print (chrId,strandId)
             self.prefetch_seq_and_scores(chrId,strandId)
 
  
    def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
-      assert chromo in self.chromosomes
+      assert chromo in self.chromo_list
       assert strand in self.strands
 
       assert strand in self.strands
 
-      currentSequence         = self.genomicSequencesP[chromo][start:stop]
-      currentDonorScores      = self.donorScoresP[chromo][start:stop]
-      currentAcceptorScores   = self.acceptorScoresP[chromo][start:stop]
+      chromo_idx = self.chromo_map[chromo]
+
+      if strand == '+':
+         currentSequence         = self.genomicSequencesP[chromo_idx][start:stop]
+         currentDonorScores      = self.donorScoresP[chromo_idx][start:stop]
+         currentAcceptorScores   = self.acceptorScoresP[chromo_idx][start:stop]
+      elif strand == '-':
+         currentSequence         = self.genomicSequencesN[chromo_idx][start:stop]
+         currentDonorScores      = self.donorScoresN[chromo_idx][start:stop]
+         currentAcceptorScores   = self.acceptorScoresN[chromo_idx][start:stop]
+      else:
+         assert False
 
       return currentSequence, currentAcceptorScores, currentDonorScores
 
 
       return currentSequence, currentAcceptorScores, currentDonorScores
 
@@ -61,30 +71,20 @@ class Lookup:
       returns then the genomic sequence of this interval and the associated scores.
       """
 
       returns then the genomic sequence of this interval and the associated scores.
       """
 
-      genomicSeq_start  = 0
+      chromo_idx = self.chromo_map[chromo]
 
 
-      if strand == '-':
-         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo+7]
-      else:
-         genomicSeq_stop   = self.seqInfo.chromo_sizes[chromo]
+      genomicSeq_start  = 0
 
       genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
       genomicSeq = genomicSeq.lower()
 
       if strand == '+':
 
       genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
       genomicSeq = genomicSeq.lower()
 
       if strand == '+':
-         self.genomicSequencesP[chromo] = genomicSeq
-         self.acceptorScoresP[chromo]   = currentAcc
-         self.donorScoresP[chromo]      = currentDon
+         self.genomicSequencesP[chromo_idx] = genomicSeq
+         self.acceptorScoresP[chromo_idx]   = currentAcc
+         self.donorScoresP[chromo_idx]      = currentDon
       elif strand == '-':
       elif strand == '-':
-         self.genomicSequencesN[chromo] = genomicSeq
-         self.acceptorScoresN[chromo]   = currentAcc
-         self.donorScoresN[chromo]      = currentDon
+         self.genomicSequencesN[chromo_idx] = genomicSeq
+         self.acceptorScoresN[chromo_idx]   = currentAcc
+         self.donorScoresN[chromo_idx]      = currentDon
       else:
          assert False
       else:
          assert False
-
-
-if __name__ == '__main__':
-   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-   lt1 = Lookup(dna_flat_files)
-
-   #self.lt1.seqInfo.get_seq_and_scores