+ changed sequence utils to support more general genomic/splice dirs and filenames
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 29 Jul 2008 15:02:41 +0000 (15:02 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 29 Jul 2008 15:02:41 +0000 (15:02 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10218 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py

index 7003c7c..930811f 100644 (file)
@@ -13,6 +13,7 @@ import os.path
 import pdb
 import re
 import subprocess
+import numpy
 
 from numpy.matlib import inf
 
@@ -125,17 +126,87 @@ def create_bracket_seq(dna_seq,read_seq):
 
 
 
+def my_load_genomic(fname, strand, start, stop, one_based=True):
+    if (type(start)==numpy.ndarray) or (type(stop)==numpy.ndarray):
+        assert(len(start) == len(stop))
+        assert((start[1:]-stop[:-1]>0).all())
+        if strand == '+':
+            idx = xrange(len(start))
+        else:
+            idx = xrange(len(start)-1,-1,-1)
+
+        seq = ''.join([load_genomic(chromosome, strand, start[ix], stop[ix], genome)\
+                       for ix in idx])
+        return seq
+
+    f=file(fname)
+    if one_based:
+        f.seek(start-1)
+        str=f.read(stop-start+1)
+    else:
+        f.seek(start)
+        str=f.read(stop-start)
+
+    if strand=='-':
+        return reverse_complement(str)
+    elif strand=='+':
+        return str
+    else:
+        print 'strand must be + or -'
+        raise KeyError
+
+
+class DataAccessWrapper:
+   """
+   The purpose of this class is to wrap the genomic/splice site score data
+   access.
+
+   
+
+   """
+
+
+   def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt):
+      self.genomic_dir        = genomic_data_dir
+      self.acc_score_dir      = acc_score_dir
+      self.don_score_dir      = don_score_dir
+      self.genomic_fmt        = gen_fmt
+      self.sscore_fmt         = sscore_fmt
+
+      assert os.path.isdir(genomic_data_dir)
+      assert os.path.isdir(acc_score_dir)
+      assert os.path.isdir(don_score_dir)
+
+
+   def get_genomic_fragment_fn(self,id,strand):
+      if strand == '+':
+         return os.path.join(self.genomic_dir,self.genomic_fmt%id)
+      elif strand == '-':
+         return os.path.join(self.genomic_dir,(self.genomic_fmt%id)+'.neg')
+      else:
+         assert False
+
+
+   def get_splice_site_scores_fn(self,id,strand):
+      acc_fn = os.path.join(self.acc_score_dir,self.sscore_fmt%(id,strand))
+      don_fn = os.path.join(self.don_score_dir,self.sscore_fmt%(id,strand))
+      return acc_fn,don_fn
+
+
+
 class SeqSpliceInfo():
+   """
+   
+   """
    
-   def __init__(self,dna_flat_files,chromo_list):
-      assert os.path.isdir(dna_flat_files)
-      self.flat_files = dna_flat_files
+   def __init__(self,accessWrapper,id_list):
+      self.accessWrapper = accessWrapper
       
       self.chromo_sizes = {}
    
-      print "Fetching chromosome sizes..."
-      for chromo in chromo_list:
-         filename = os.path.join(self.flat_files,'chr%d.dna.flat' % chromo)
+      print "Fetching fragment sizes..."
+      for chromo in id_list:
+         filename = self.accessWrapper.get_genomic_fragment_fn(chromo,'+')
          total_size = get_flat_file_size(filename)
          self.chromo_sizes[chromo] = total_size
 
@@ -160,20 +231,22 @@ class SeqSpliceInfo():
 
       total_size = self.chromo_sizes[chromo]
 
+      acc_fn, don_fn = self.accessWrapper.get_splice_site_scores_fn(chromo,strand)
+
       # fetch acceptor scores
-      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-      acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+      acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,acc_fn,total_size)
 
       # fetch donor scores
-      sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-      don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+      don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,don_fn,total_size)
 
       return acc, don
 
 
    def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
-      chromo_string  = 'chr%d' % chromo
-      genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
+      #chromo_string  = 'chr%d' % chromo
+      #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
+      filename = self.accessWrapper.get_genomic_fragment_fn(chromo,strand)
+      genomicSeq = my_load_genomic(filename,'+',genomicSeq_start,genomicSeq_stop,one_based=False)
       genomicSeq = genomicSeq.strip().lower()
 
       # check the obtained dna sequence
@@ -197,10 +270,7 @@ class SeqSpliceInfo():
 
       assert genomicSeq_start < genomicSeq_stop
 
-      if strand == '-':
-         genomicSeq = self.fetch_sequence(chromo+7,'+',genomicSeq_start,genomicSeq_stop)
-      else:
-         genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
+      genomicSeq = self.fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop)
 
       if only_seq:
          return genomicSeq
@@ -235,9 +305,9 @@ class SeqSpliceInfo():
       currentAcc = currentAcc[:len(genomicSeq)]
       currentDon = currentDon[:len(genomicSeq)]
 
-      #length = len(genomicSeq)
-      #currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
-      #currentDon = currentDon+[-inf]*(length-len(currentDon))
+      length = len(genomicSeq)
+      currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
+      currentDon = currentDon+[-inf]*(length-len(currentDon))
       currentDon[-1] = -inf
       
       gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if (p>0 and p<len(genomicSeq)-1 and e=='g' ) and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
@@ -267,13 +337,13 @@ class SeqSpliceInfo():
          print 'ACC: Chromo/strand: %d/%s' % (chromo,strand)
          print ag_tuple_pos[:10],ag_tuple_pos[-10:]
          print acc_pos[:10],acc_pos[-10:]
-         #pdb.set_trace()
+         pdb.set_trace()
 
       if not gt_tuple_pos == don_pos:
          print 'DON: Chromo/strand: %d/%s' % (chromo,strand)
          print gt_tuple_pos[:10],gt_tuple_pos[-10:]
          print don_pos[:10],don_pos[-10:]
-         #pdb.set_trace()
+         pdb.set_trace()
 
       #assert ag_tuple_pos == acc_pos, pdb.set_trace()
       #assert gt_tuple_pos == don_pos, pdb.set_trace()