From: fabio Date: Wed, 9 Jul 2008 17:16:25 +0000 (+0000) Subject: + wrapped sequence and score functions in order to reduce function calls X-Git-Url: http://git.tuebingen.mpg.de/?p=qpalma.git;a=commitdiff_plain;h=feec0e84e5dfb94352b73fafdabc857bca87e1b7 + wrapped sequence and score functions in order to reduce function calls git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9953 e1793c9e-67f9-0310-80fc-b846ff1f7b36 --- diff --git a/qpalma/sequence_utils.py b/qpalma/sequence_utils.py index f82cd62..064ff03 100644 --- a/qpalma/sequence_utils.py +++ b/qpalma/sequence_utils.py @@ -9,6 +9,7 @@ # import os +import os.path import pdb import re import subprocess @@ -26,7 +27,7 @@ class DataAccessionException: pass -def get_flatfile_size(filename): +def get_flat_file_size(filename): cmd = 'wc -c %s | cut -f1 -d \' \'' % filename obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE) out,err = obj.communicate() @@ -123,120 +124,136 @@ def create_bracket_seq(dna_seq,read_seq): return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq)) -def getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size=0,genomicSeq=''): - """ - Now we want to use interval_query to get the predicted splice scores trained - on the TAIR sequence and annotation. - """ - size = intervalEnd-intervalBegin - assert size > 1, 'Error (getSpliceScores): interval size is less than 2!' +class SeqSpliceInfo(): + + def __init__(self,dna_flat_files,chromo_list): + assert os.path.isdir(dna_flat_files) + self.flat_files = dna_flat_files + + self.chromo_sizes = {} + + print "Fetching chromosome sizes..." + for chromo in chromo_list: + filename = os.path.join(self.flat_files,'chr%d.dna.flat' % chromo) + total_size = get_flat_file_size(filename) + self.chromo_sizes[chromo] = total_size - acc = size*[0.0] - don = size*[0.0] + print "done" - interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd]) - pos_size = new_intp() - intp_assign(pos_size,1) - # fetch acceptor scores - sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' - acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size) + def getSpliceScores(self,chromo,strand,intervalBegin,intervalEnd,genomicSeq=''): + """ + Now we want to use interval_query to get the predicted splice scores trained + on the TAIR sequence and annotation. + """ - #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p 1][:100] + size = intervalEnd-intervalBegin + assert size > 1, 'Error (getSpliceScores): interval size is less than 2!' - # fetch donor scores - sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' - don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size) + acc = size*[0.0] + don = size*[0.0] - #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p 1][:100] - #pdb.set_trace() + interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd]) + pos_size = new_intp() + intp_assign(pos_size,1) - return acc, don + total_size = self.chromo_sizes[chromo] + # fetch acceptor scores + sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' + acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size) -def fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files): - chromo_string = 'chr%d' % chromo - #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False) - #print 'loading %s...' % chromo_string - genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,one_based=False) - genomicSeq = genomicSeq.strip().lower() + #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p 1][:100] - # check the obtained dna sequence - assert genomicSeq != '', 'load_genomic returned empty sequence!' - - # all entries other than a c g t are set to n - no_base = re.compile('[^acgt]') - genomicSeq = no_base.sub('n',genomicSeq) + # fetch donor scores + sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' + don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size) - return genomicSeq + #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p 1][:100] + #pdb.set_trace() + return acc, don -def get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False): - """ - This function expects an interval, chromosome and strand information and - returns then the genomic sequence of this interval and the associated scores. - """ - # Because splice site scores are predicted using a window of the sequence the - # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence - # do not have any score predictions - NO_SCORE_WINDOW_SIZE = 205 - assert genomicSeq_start < genomicSeq_stop + def fetch_sequence(self,chromo,strand,genomicSeq_start,genomicSeq_stop): + chromo_string = 'chr%d' % chromo + genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,self.flat_files,one_based=False) + genomicSeq = genomicSeq.strip().lower() - genomicSeq = fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files) + # check the obtained dna sequence + assert genomicSeq != '', 'load_genomic returned empty sequence!' + + # all entries other than a c g t are set to n + no_base = re.compile('[^acgt]') + genomicSeq = no_base.sub('n',genomicSeq) - if only_seq: return genomicSeq - filename = os.path.join(dna_flat_files,'chr%d.dna.flat' % chromo) - total_size = get_flatfile_size(filename) + def get_seq_and_scores(self,chromo,strand,genomicSeq_start,genomicSeq_stop,only_seq=False): + """ + This function expects an interval, chromosome and strand information and + returns then the genomic sequence of this interval and the associated scores. + """ + # Because splice site scores are predicted using a window of the sequence the + # first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence + # do not have any score predictions + NO_SCORE_WINDOW_SIZE = 205 - # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size - lookup_offset = 0 - intervalBegin = max(genomicSeq_start-lookup_offset,0) - intervalEnd = min(genomicSeq_stop+lookup_offset,total_size-1) + assert genomicSeq_start < genomicSeq_stop - if chromo > 7: - print intervalBegin,intervalEnd,total_size - currentAcc, currentDon = getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq) - currentDon = [-inf]+currentDon[:-1] - else: - currentAcc, currentDon = getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq) - currentAcc = currentAcc[2:] - currentDon = currentDon[1:] - - length = len(genomicSeq) - currentAcc = currentAcc+[-inf]*(length-len(currentAcc)) - currentDon = currentDon+[-inf]*(length-len(currentDon)) - - #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1] - #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0] - - gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p1 and p total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf: - currentAcc[pos] = 1e-6 + total_size = self.chromo_sizes[chromo] - for pos in gt_tuple_pos: - if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf: - currentDon[pos] = 1e-6 + # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size + lookup_offset = 0 + intervalBegin = max(genomicSeq_start-lookup_offset,0) + intervalEnd = min(genomicSeq_stop+lookup_offset,total_size-1) - if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf: - currentDon[pos] = 1e-6 - - assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace() - assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace() - assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace() - - return genomicSeq, currentAcc, currentDon + if chromo > 7: + print intervalBegin,intervalEnd,total_size + currentAcc, currentDon = self.getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq) + currentDon = [-inf]+currentDon[:-1] + else: + currentAcc, currentDon = self.getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq) + currentAcc = currentAcc[2:] + currentDon = currentDon[1:] + + length = len(genomicSeq) + currentAcc = currentAcc+[-inf]*(length-len(currentAcc)) + currentDon = currentDon+[-inf]*(length-len(currentDon)) + + #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1] + #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0] + + gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p1 and p total_size - NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf: + currentAcc[pos] = 1e-6 + + for pos in gt_tuple_pos: + if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf: + currentDon[pos] = 1e-6 + + if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf: + currentDon[pos] = 1e-6 + + assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace() + assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace() + assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace() + + return genomicSeq, currentAcc, currentDon