+ some bugfixes
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 9 Jul 2008 14:39:16 +0000 (14:39 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 9 Jul 2008 14:39:16 +0000 (14:39 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9939 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py
tests/.test_sequence_utils.py.swp [new file with mode: 0644]
tests/test_data/chr1.dna.flat [new file with mode: 0644]
tests/test_sequence_utils.py [new file with mode: 0644]

index 9a8e983..f82cd62 100644 (file)
@@ -21,6 +21,11 @@ from genome_utils import load_genomic
 extended_alphabet = ['-','a','c','g','t','n','[',']']
 alphabet          = ['-','a','c','g','t','n']
 
+
+class DataAccessionException:
+   pass
+
+
 def get_flatfile_size(filename):
    cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
    obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
@@ -28,6 +33,8 @@ def get_flatfile_size(filename):
    
    if err != '':
       print 'Error occurred while trying to obtain file size'
+      raise DataAccessionException
+
    return int(out)
 
 
@@ -116,8 +123,7 @@ def create_bracket_seq(dna_seq,read_seq):
    return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
 
 
-
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
+def getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size=0,genomicSeq=''):
    """
    Now we want to use interval_query to get the predicted splice scores trained
    on the TAIR sequence and annotation.
@@ -135,116 +141,102 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
 
    # fetch acceptor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+   acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+
+   #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
+   #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
 
    # fetch donor scores
    sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+   don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+
+   #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+   #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
+   #pdb.set_trace()
 
    return acc, don
 
 
-def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
+def fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+   chromo_string  = 'chr%d' % chromo
+   #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+   #print 'loading %s...' % chromo_string 
+   genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.strip().lower()
+
+   # check the obtained dna sequence
+   assert genomicSeq != '', 'load_genomic returned empty sequence!'
+   
+   # all entries other than a c g t are set to n
+   no_base = re.compile('[^acgt]')
+   genomicSeq = no_base.sub('n',genomicSeq)
+
+   return genomicSeq
+
+
+def get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
    """
    This function expects an interval, chromosome and strand information and
    returns then the genomic sequence of this interval and the associated scores.
    """
-
-   # because splice site scores are predicted using a window of the sequence the
+   # Because splice site scores are predicted using a window of the sequence the
    # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
    # do not have any score predictions
-   NO_SCORE_WINDOW_SIZE = 250
+   NO_SCORE_WINDOW_SIZE = 205
 
    assert genomicSeq_start < genomicSeq_stop
 
-   chrom         = 'chr%d' % chr
-   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   # check the obtained dna sequence
-   assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   
-   # all entries other than a c g t are set to n
-   no_base = re.compile('[^acgt]')
-   genomicSeq = no_base.sub('n',genomicSeq)
+   genomicSeq = fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files)
 
    if only_seq:
       return genomicSeq
 
-   fn = 'chr%d.dna.flat' % chr
-   filename = os.path.join(dna_flat_files,fn)
+   filename = os.path.join(dna_flat_files,'chr%d.dna.flat' % chromo)
    total_size = get_flatfile_size(filename)
 
-   intervalBegin = genomicSeq_start-100
-   intervalEnd    = genomicSeq_stop+100
-
-   # shorten the intervalEnd to the maximal size of the genomic sequence if it
-   # would exceed this
-   if intervalEnd > total_size:
-      intervalEnd = total_size-1
-
-   # we have assertions below checking whether every gt or ag has a splice score
-   # not equal to -inf if the gt or ag are within the first NO_SCORE_WINDOW_SIZE
-   # nucleotide then they have no scores at all
-   check_assertions = True
-   if intervalEnd > total_size - NO_SCORE_WINDOW_SIZE:
-      check_assertions = False
-      
-   if intervalBegin < NO_SCORE_WINDOW_SIZE:
-      check_assertions = False
+   # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
+   lookup_offset  = 0
+   intervalBegin  = max(genomicSeq_start-lookup_offset,0)
+   intervalEnd    = min(genomicSeq_stop+lookup_offset,total_size-1)
 
-   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
-
-   currentAcc = currentAcc[100:-98]
-   currentAcc = currentAcc[1:]
-   currentDon = currentDon[100:-100]
+   if chromo > 7:
+      print intervalBegin,intervalEnd,total_size
+      currentAcc, currentDon = getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
+      currentDon = [-inf]+currentDon[:-1]
+   else:
+      currentAcc, currentDon = getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
+      currentAcc = currentAcc[2:]
+      currentDon = currentDon[1:]
 
    length = len(genomicSeq)
+   currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
+   currentDon = currentDon+[-inf]*(length-len(currentDon))
+   
+   #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+   #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
 
-   # indices for positive strand
-   if strand == '+':
-      currentAcc = currentAcc[:length]
-      currentDon = currentDon+[-inf]*(length-len(currentDon))
+   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
 
-      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
-      gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+   #pdb.set_trace()
+   #return genomicSeq, currentAcc, currentDon
+   for pos in ag_tuple_pos:
+      if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+         currentAcc[pos] = 1e-6
 
-   # take care of different indexation if on negative strand
-   if strand == '-':
-      currentAcc = currentAcc[:length]
-      currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
-      currentDon = currentDon+[-inf]*(length-len(currentDon))
+      if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+         currentAcc[pos] = 1e-6
 
-      ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
-      gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+   for pos in gt_tuple_pos:
+      if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+         currentDon[pos] = 1e-6
 
-   if check_assertions:
-      assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
-      assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
-      assert len(currentAcc) == len(currentDon), pdb.set_trace()
-   else:
-      for pos in ag_tuple_pos:
-         if currentAcc[pos] == -inf:
-            currentAcc[pos] = 0.01
+      if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+         currentDon[pos] = 1e-6
 
-      for pos in gt_tuple_pos:
-         if currentDon[pos] == -inf:
-            currentDon[pos] = 0.01
+   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+   assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
 
    return genomicSeq, currentAcc, currentDon
-
-
-def checks():
-   start  = 1001
-   stop   = start+1234
-   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-   for chromo in range(1,6):
-         p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
-         n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
-
-         print p_seq == reverse_complement(n_seq)
-
-         
-if __name__ == '__main__':
-   checks()
diff --git a/tests/.test_sequence_utils.py.swp b/tests/.test_sequence_utils.py.swp
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/tests/test_data/chr1.dna.flat b/tests/test_data/chr1.dna.flat
new file mode 100644 (file)
index 0000000..869713a
--- /dev/null
@@ -0,0 +1 @@
+acgtacgtagcgatctgctcgtacgtgctcgatcaggcagtczzuuctgt
diff --git a/tests/test_sequence_utils.py b/tests/test_sequence_utils.py
new file mode 100644 (file)
index 0000000..c2a3163
--- /dev/null
@@ -0,0 +1,118 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import pdb
+import numpy
+import qpalma.sequence_utils
+
+   #flat_files = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/tests/test_data'
+
+   #begin =   0
+   #end   =  60
+   #dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(1,'+',begin,end,flat_files,True)
+   #dna = qpalma.sequence_utils.get_seq_and_scores(1,'+',begin,end,flat_files,True)
+   #print dna
+
+def check_positions(dna,acc,don,offset=0):
+   first_gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')][:offset]
+   first_ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and p<len(dna) and dna[p-1]=='a' and dna[p]=='g'][:offset]
+
+   first_acc_scores = [p for p,e in enumerate(acc) if e != -numpy.inf][:offset]
+   first_don_scores = [p for p,e in enumerate(don) if e != -numpy.inf][:offset]
+
+   last_gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')][-offset:]
+   last_ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and p<len(dna) and dna[p-1]=='a' and dna[p]=='g'][-offset:]
+
+   last_acc_scores = [p for p,e in enumerate(acc) if e != -numpy.inf][-offset:]
+   last_don_scores = [p for p,e in enumerate(don) if e != -numpy.inf][-offset:]
+
+   assert first_gt_tuple_pos == first_don_scores
+   assert first_ag_tuple_pos == first_acc_scores
+
+   assert last_gt_tuple_pos == last_don_scores
+   assert last_ag_tuple_pos == last_acc_scores
+
+
+def run():
+   flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   chromo   = 1
+   strand   = '+'
+   begin    = 0
+   end      = 100000000
+   dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+   check_positions(dna,acc,don,100)
+
+
+def run2():
+   flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   chromo   = 8
+   strand   = '-'
+   begin    = 0
+   end      = 100000000
+   dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+   check_positions(dna,acc,don,100)
+
+   pdb.set_trace()
+
+
+def run3():
+   flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   
+   begin = 0
+   end   = 100000000
+   full_pos_dna = qpalma.sequence_utils.get_seq_and_scores(1,'+',begin,end,flat_files,True)
+
+   begin = 0
+   end   = 500
+   dna = qpalma.sequence_utils.get_seq_and_scores(8,'+',begin,end,flat_files,True)
+
+   pos_dna = full_pos_dna[-500:]
+   r_dna = qpalma.sequence_utils.reverse_complement(dna)
+   pos_dna == r_dna
+   pdb.set_trace()
+   
+def run4():
+   flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   chromo   = 8
+   strand   = '-'
+   begin    = 200
+   end      = 1200
+   dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+   check_positions(dna,acc,don)
+
+   pdb.set_trace()
+
+
+def check_negative_strand(b,e):
+   flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   chromo   = 8
+   strand   = '-'
+   begin    = b
+   end      = e
+   dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+   pdb.set_trace()
+
+   check_positions(dna,acc,don)
+
+   print 'fine'
+
+def check_positive_strand(b,e):
+   flat_files =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   chromo   = 1
+   strand   = '+'
+   begin    = b
+   end      = e
+   dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+   check_positions(dna,acc,don)
+   print 'fine'
+
+if __name__ == '__main__':
+   #run()
+   #run2()
+   #run3(
+   run4()