+ added wrapper class for prefetching of genomic sequences and splice scores
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 12:15:49 +0000 (12:15 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 12:15:49 +0000 (12:15 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8674 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Lookup.py [new file with mode: 0644]

diff --git a/scripts/Lookup.py b/scripts/Lookup.py
new file mode 100644 (file)
index 0000000..0fabc39
--- /dev/null
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import random
+import os
+import re
+import pdb
+
+from Genefinding import *
+
+from genome_utils import load_genomic
+
+class Lookup:
+   """
+   Speed up genomic and splice site score queries
+   """
+
+
+   def __init__(self,dna_flat_files):
+      self.dna_flat_files = dna_flat_files
+
+      self.max_size = 40000000
+
+      self.chromosomes = range(1,6)
+
+      strands = ['+']
+
+      self.genomicSequences   = [0]*(len(self.chromosomes)+1)
+      self.acceptorScores     = [0]*(len(self.chromosomes)+1)
+      self.donorScores        = [0]*(len(self.chromosomes)+1)
+     
+      for chrId in self.chromosomes:
+         for strandId in strands:
+            print (chrId,strandId)
+            self.prefetch_seq_and_scores(chrId,strandId)
+
+   def get_seq_and_scores(self,chr,strand,start,stop,dna_flat_files):
+      assert chr in self.chromosomes
+
+      currentSequence         = self.genomicSequences[chr][start:stop]
+      currentDonors           = self.donorScores[chr][start:stop]
+      currentAcceptorScores   = self.acceptorScores[chr][start:stop]
+      currentAcceptorScores   = currentAcceptorScores[chr][start:stop]
+
+
+   def prefetch_seq_and_scores(self,chr,strand):
+      """
+      This function expects an interval, chromosome and strand information and
+      returns then the genomic sequence of this interval and the associated scores.
+      """
+
+      chrom         = 'chr%d' % chr
+
+      genomicSeq_start  = 0
+      genomicSeq_stop   = self.max_size
+      genomicSeq = load_genomic(chrom,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files,one_based=False)
+      genomicSeq = genomicSeq.lower()
+
+      no_base = re.compile('[^acgt]')
+      genomicSeq = no_base.sub('n',genomicSeq)
+      # check the obtained dna sequence
+      assert genomicSeq != '', 'load_genomic returned empty sequence!'
+
+      self.genomicSequences[chr] = genomicSeq
+
+      intervalBegin  = 0
+      intervalEnd    = self.max_size
+
+      currentAcc, currentDon = self.getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
+      self.acceptorScores[chr]   = currentAcc
+      self.donorScores[chr]      = currentDon
+
+
+   def getSpliceScores(self,chr,strand,intervalBegin,intervalEnd):
+      """
+      Now we want to use interval_query to get the predicted splice scores trained
+      on the TAIR sequence and annotation.
+      """
+      
+      assert intervalEnd > intervalBegin
+
+      size = intervalEnd-intervalBegin
+
+      acc = size*[0.0]
+      don = size*[0.0]
+
+      interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
+      pos_size = new_intp()
+      intp_assign(pos_size,1)
+
+      # fetch acceptor scores
+      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+      acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+      # fetch donor scores
+      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+      don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+      return acc, don
+
+if __name__ == '__main__':
+   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   lt1 = Lookup(dna_flat_files)
+   print lt1.get_seq_and_scores(1,'+',1001,2002,'')