+ moved sequence access functions to own module
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:29:59 +0000 (09:29 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:29:59 +0000 (09:29 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9300 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/sequence_utils.py [new file with mode: 0644]
scripts/qpalma_main.py

diff --git a/qpalma/sequence_utils.py b/qpalma/sequence_utils.py
new file mode 100644 (file)
index 0000000..0ebb5bc
--- /dev/null
@@ -0,0 +1,73 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
+   """
+   Now we want to use interval_query to get the predicted splice scores trained
+   on the TAIR sequence and annotation.
+   """
+
+   size = intervalEnd-intervalBegin
+   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
+
+   acc = size*[0.0]
+   don = size*[0.0]
+
+   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
+   pos_size = new_intp()
+   intp_assign(pos_size,1)
+
+   # fetch acceptor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   # fetch donor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   return acc, don
+
+
+def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+   """
+   This function expects an interval, chromosome and strand information and
+   returns then the genomic sequence of this interval and the associated scores.
+   """
+
+   assert genomicSeq_start < genomicSeq_stop
+
+   chrom         = 'chr%d' % chr
+   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   # check the obtained dna sequence
+   assert genomicSeq != '', 'load_genomic returned empty sequence!'
+   
+   # all entries other than a c g t are set to n
+   no_base = re.compile('[^acgt]')
+   genomicSeq = no_base.sub('n',genomicSeq)
+
+   intervalBegin  = genomicSeq_start-100
+   intervalEnd    = genomicSeq_stop+100
+   seq_pos_offset = genomicSeq_start
+
+   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
+   currentAcc = currentAcc[100:-98]
+   currentAcc = currentAcc[1:]
+   currentDon = currentDon[100:-100]
+
+   length = len(genomicSeq)
+   currentAcc = currentAcc[:length]
+
+   currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
+   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+
+   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+   assert len(currentAcc) == len(currentDon)
+
+   return genomicSeq, currentAcc, currentDon
index 4ce1a39..d32a2e7 100644 (file)
@@ -20,7 +20,7 @@ import pdb
 import re
 import os.path
 
-from compile_dataset import getSpliceScores, get_seq_and_scores
+from qpalma.sequence_utils import getSpliceScores, get_seq_and_scores
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf