#
import os
+import os.path
import pdb
import re
import subprocess
pass
-def get_flatfile_size(filename):
+def get_flat_file_size(filename):
cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
out,err = obj.communicate()
return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
-def getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size=0,genomicSeq=''):
- """
- Now we want to use interval_query to get the predicted splice scores trained
- on the TAIR sequence and annotation.
- """
- size = intervalEnd-intervalBegin
- assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
+class SeqSpliceInfo():
+
+ def __init__(self,dna_flat_files,chromo_list):
+ assert os.path.isdir(dna_flat_files)
+ self.flat_files = dna_flat_files
+
+ self.chromo_sizes = {}
+
+ print "Fetching chromosome sizes..."
+ for chromo in chromo_list:
+ filename = os.path.join(self.flat_files,'chr%d.dna.flat' % chromo)
+ total_size = get_flat_file_size(filename)
+ self.chromo_sizes[chromo] = total_size
- acc = size*[0.0]
- don = size*[0.0]
+ print "done"
- interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
- pos_size = new_intp()
- intp_assign(pos_size,1)
- # fetch acceptor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
- acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+ def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''):
+ """
+ Now we want to use interval_query to get the predicted splice scores trained
+ on the TAIR sequence and annotation.
+ """
- #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
- #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
+ size = intervalEnd-intervalBegin
+ assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
- # fetch donor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
- don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+ acc = size*[0.0]
+ don = size*[0.0]
- #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
- #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
- #pdb.set_trace()
+ interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
+ pos_size = new_intp()
+ intp_assign(pos_size,1)
- return acc, don
+ total_size = self.chromo_sizes[chromo]
+ # fetch acceptor scores
+ sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+ acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
-def fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
- chromo_string = 'chr%d' % chromo
- #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
- #print 'loading %s...' % chromo_string
- genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,one_based=False)
- genomicSeq = genomicSeq.strip().lower()
+ #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
+ #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
- # check the obtained dna sequence
- assert genomicSeq != '', 'load_genomic returned empty sequence!'
-
- # all entries other than a c g t are set to n
- no_base = re.compile('[^acgt]')
- genomicSeq = no_base.sub('n',genomicSeq)
+ # fetch donor scores
+ sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+ don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
- return genomicSeq
+ #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
+ #pdb.set_trace()
+ return acc, don
-def get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
- """
- This function expects an interval, chromosome and strand information and
- returns then the genomic sequence of this interval and the associated scores.
- """
- # Because splice site scores are predicted using a window of the sequence the
- # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
- # do not have any score predictions
- NO_SCORE_WINDOW_SIZE = 205
- assert genomicSeq_start < genomicSeq_stop
+ def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop):
+ chromo_string = 'chr%d' % chromo
+ genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False)
+ genomicSeq = genomicSeq.strip().lower()
- genomicSeq = fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files)
+ # check the obtained dna sequence
+ assert genomicSeq != '', 'load_genomic returned empty sequence!'
+
+ # all entries other than a c g t are set to n
+ no_base = re.compile('[^acgt]')
+ genomicSeq = no_base.sub('n',genomicSeq)
- if only_seq:
return genomicSeq
- filename = os.path.join(dna_flat_files,'chr%d.dna.flat' % chromo)
- total_size = get_flatfile_size(filename)
+ def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False):
+ """
+ This function expects an interval, chromosome and strand information and
+ returns then the genomic sequence of this interval and the associated scores.
+ """
+ # Because splice site scores are predicted using a window of the sequence the
+ # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
+ # do not have any score predictions
+ NO_SCORE_WINDOW_SIZE = 205
- # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
- lookup_offset = 0
- intervalBegin = max(genomicSeq_start-lookup_offset,0)
- intervalEnd = min(genomicSeq_stop+lookup_offset,total_size-1)
+ assert genomicSeq_start < genomicSeq_stop
- if chromo > 7:
- print intervalBegin,intervalEnd,total_size
- currentAcc, currentDon = getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
- currentDon = [-inf]+currentDon[:-1]
- else:
- currentAcc, currentDon = getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
- currentAcc = currentAcc[2:]
- currentDon = currentDon[1:]
-
- length = len(genomicSeq)
- currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
- currentDon = currentDon+[-inf]*(length-len(currentDon))
-
- #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
- #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
-
- gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
- ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+ genomicSeq = self.fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop)
- #pdb.set_trace()
- #return genomicSeq, currentAcc, currentDon
-
- for pos in ag_tuple_pos:
- if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
- currentAcc[pos] = 1e-6
+ if only_seq:
+ return genomicSeq
- if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
- currentAcc[pos] = 1e-6
+ total_size = self.chromo_sizes[chromo]
- for pos in gt_tuple_pos:
- if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
- currentDon[pos] = 1e-6
+ # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
+ lookup_offset = 0
+ intervalBegin = max(genomicSeq_start-lookup_offset,0)
+ intervalEnd = min(genomicSeq_stop+lookup_offset,total_size-1)
- if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
- currentDon[pos] = 1e-6
-
- assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
- assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
- assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
-
- return genomicSeq, currentAcc, currentDon
+ if chromo > 7:
+ print intervalBegin,intervalEnd,total_size
+ currentAcc, currentDon = self.getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
+ currentDon = [-inf]+currentDon[:-1]
+ else:
+ currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
+ currentAcc = currentAcc[2:]
+ currentDon = currentDon[1:]
+
+ length = len(genomicSeq)
+ currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
+ currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+ #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+ #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
+
+ gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+
+ #pdb.set_trace()
+ #return genomicSeq, currentAcc, currentDon
+
+ for pos in ag_tuple_pos:
+ if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+ currentAcc[pos] = 1e-6
+
+ if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+ currentAcc[pos] = 1e-6
+
+ for pos in gt_tuple_pos:
+ if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+ currentDon[pos] = 1e-6
+
+ if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+ currentDon[pos] = 1e-6
+
+ assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+ assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+ assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
+
+ return genomicSeq, currentAcc, currentDon