extended_alphabet = ['-','a','c','g','t','n','[',']']
alphabet = ['-','a','c','g','t','n']
+
+class DataAccessionException:
+ pass
+
+
def get_flatfile_size(filename):
cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
if err != '':
print 'Error occurred while trying to obtain file size'
+ raise DataAccessionException
+
return int(out)
return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
-
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
+def getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size=0,genomicSeq=''):
"""
Now we want to use interval_query to get the predicted splice scores trained
on the TAIR sequence and annotation.
# fetch acceptor scores
sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
- acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+ acc = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+
+ #ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g'][:100]
+ #acc_pos = [p for p,e in enumerate(acc) if e != -inf and p > 1][:100]
# fetch donor scores
sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
- don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+ don = doIntervalQuery(chromo,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
+
+ #gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ #don_pos = [p for p,e in enumerate(don) if e != -inf and p > 1][:100]
+ #pdb.set_trace()
return acc, don
-def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
+def fetch_sequence(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+ chromo_string = 'chr%d' % chromo
+ #genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+ #print 'loading %s...' % chromo_string
+ genomicSeq = load_genomic(chromo_string,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,one_based=False)
+ genomicSeq = genomicSeq.strip().lower()
+
+ # check the obtained dna sequence
+ assert genomicSeq != '', 'load_genomic returned empty sequence!'
+
+ # all entries other than a c g t are set to n
+ no_base = re.compile('[^acgt]')
+ genomicSeq = no_base.sub('n',genomicSeq)
+
+ return genomicSeq
+
+
+def get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files,only_seq=False):
"""
This function expects an interval, chromosome and strand information and
returns then the genomic sequence of this interval and the associated scores.
"""
-
- # because splice site scores are predicted using a window of the sequence the
+ # Because splice site scores are predicted using a window of the sequence the
# first and the last NO_SCORE_WINDOW_SIZE nucleotides of a genomic sequence
# do not have any score predictions
- NO_SCORE_WINDOW_SIZE = 250
+ NO_SCORE_WINDOW_SIZE = 205
assert genomicSeq_start < genomicSeq_stop
- chrom = 'chr%d' % chr
- genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
- genomicSeq = genomicSeq.lower()
-
- # check the obtained dna sequence
- assert genomicSeq != '', 'load_genomic returned empty sequence!'
-
- # all entries other than a c g t are set to n
- no_base = re.compile('[^acgt]')
- genomicSeq = no_base.sub('n',genomicSeq)
+ genomicSeq = fetch_sequence(chromo,'+',genomicSeq_start,genomicSeq_stop,dna_flat_files)
if only_seq:
return genomicSeq
- fn = 'chr%d.dna.flat' % chr
- filename = os.path.join(dna_flat_files,fn)
+ filename = os.path.join(dna_flat_files,'chr%d.dna.flat' % chromo)
total_size = get_flatfile_size(filename)
- intervalBegin = genomicSeq_start-100
- intervalEnd = genomicSeq_stop+100
-
- # shorten the intervalEnd to the maximal size of the genomic sequence if it
- # would exceed this
- if intervalEnd > total_size:
- intervalEnd = total_size-1
-
- # we have assertions below checking whether every gt or ag has a splice score
- # not equal to -inf if the gt or ag are within the first NO_SCORE_WINDOW_SIZE
- # nucleotide then they have no scores at all
- check_assertions = True
- if intervalEnd > total_size - NO_SCORE_WINDOW_SIZE:
- check_assertions = False
-
- if intervalBegin < NO_SCORE_WINDOW_SIZE:
- check_assertions = False
+ # shorten the intervalEnd to the maximal size of the genomic sequence if it would exceed this size
+ lookup_offset = 0
+ intervalBegin = max(genomicSeq_start-lookup_offset,0)
+ intervalEnd = min(genomicSeq_stop+lookup_offset,total_size-1)
- currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
-
- currentAcc = currentAcc[100:-98]
- currentAcc = currentAcc[1:]
- currentDon = currentDon[100:-100]
+ if chromo > 7:
+ print intervalBegin,intervalEnd,total_size
+ currentAcc, currentDon = getSpliceScores(chromo-7,'-',intervalBegin,intervalEnd,total_size,genomicSeq)
+ currentDon = [-inf]+currentDon[:-1]
+ else:
+ currentAcc, currentDon = getSpliceScores(chromo,strand,intervalBegin,intervalEnd,total_size,genomicSeq)
+ currentAcc = currentAcc[2:]
+ currentDon = currentDon[1:]
length = len(genomicSeq)
+ currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
+ currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+ #acc_pos = [p for p,e in enumerate(currentAcc) if e != -inf and p > 1]
+ #don_pos = [p for p,e in enumerate(currentDon) if e != -inf and p > 0]
- # indices for positive strand
- if strand == '+':
- currentAcc = currentAcc[:length]
- currentDon = currentDon+[-inf]*(length-len(currentDon))
+ gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
- ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
- gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ #pdb.set_trace()
+ #return genomicSeq, currentAcc, currentDon
+
+ for pos in ag_tuple_pos:
+ if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentAcc[pos] == -inf:
+ currentAcc[pos] = 1e-6
- # take care of different indexation if on negative strand
- if strand == '-':
- currentAcc = currentAcc[:length]
- currentAcc = currentAcc+[-inf]*(length-len(currentAcc))
- currentDon = currentDon+[-inf]*(length-len(currentDon))
+ if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+ currentAcc[pos] = 1e-6
- ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and p<len(genomicSeq) and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
- gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ for pos in gt_tuple_pos:
+ if pos+intervalBegin < NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+ currentDon[pos] = 1e-6
- if check_assertions:
- assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
- assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
- assert len(currentAcc) == len(currentDon), pdb.set_trace()
- else:
- for pos in ag_tuple_pos:
- if currentAcc[pos] == -inf:
- currentAcc[pos] = 0.01
+ if pos+intervalBegin > total_size - NO_SCORE_WINDOW_SIZE and currentDon[pos] == -inf:
+ currentDon[pos] = 1e-6
- for pos in gt_tuple_pos:
- if currentDon[pos] == -inf:
- currentDon[pos] = 0.01
+ assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+ assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+ assert len(genomicSeq) == len(currentAcc) == len(currentDon), pdb.set_trace()
return genomicSeq, currentAcc, currentDon
-
-
-def checks():
- start = 1001
- stop = start+1234
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
- for chromo in range(1,6):
- p_seq,_p_acc,_p_don = get_seq_and_scores(chromo,'+',start,stop,dna_flat_files)
- n_seq,n_acc,n_don = get_seq_and_scores(chromo,'-',start,stop,dna_flat_files)
-
- print p_seq == reverse_complement(n_seq)
-
-
-if __name__ == '__main__':
- checks()
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import pdb
+import numpy
+import qpalma.sequence_utils
+
+ #flat_files = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/tests/test_data'
+
+ #begin = 0
+ #end = 60
+ #dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(1,'+',begin,end,flat_files,True)
+ #dna = qpalma.sequence_utils.get_seq_and_scores(1,'+',begin,end,flat_files,True)
+ #print dna
+
+def check_positions(dna,acc,don,offset=0):
+ first_gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')][:offset]
+ first_ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and p<len(dna) and dna[p-1]=='a' and dna[p]=='g'][:offset]
+
+ first_acc_scores = [p for p,e in enumerate(acc) if e != -numpy.inf][:offset]
+ first_don_scores = [p for p,e in enumerate(don) if e != -numpy.inf][:offset]
+
+ last_gt_tuple_pos = [p for p,e in enumerate(dna) if p>0 and p<len(dna)-1 and e=='g' and (dna[p+1]=='t' or dna[p+1]=='c')][-offset:]
+ last_ag_tuple_pos = [p for p,e in enumerate(dna) if p>1 and p<len(dna) and dna[p-1]=='a' and dna[p]=='g'][-offset:]
+
+ last_acc_scores = [p for p,e in enumerate(acc) if e != -numpy.inf][-offset:]
+ last_don_scores = [p for p,e in enumerate(don) if e != -numpy.inf][-offset:]
+
+ assert first_gt_tuple_pos == first_don_scores
+ assert first_ag_tuple_pos == first_acc_scores
+
+ assert last_gt_tuple_pos == last_don_scores
+ assert last_ag_tuple_pos == last_acc_scores
+
+
+def run():
+ flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ chromo = 1
+ strand = '+'
+ begin = 0
+ end = 100000000
+ dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+ check_positions(dna,acc,don,100)
+
+
+def run2():
+ flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ chromo = 8
+ strand = '-'
+ begin = 0
+ end = 100000000
+ dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+ check_positions(dna,acc,don,100)
+
+ pdb.set_trace()
+
+
+def run3():
+ flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+ begin = 0
+ end = 100000000
+ full_pos_dna = qpalma.sequence_utils.get_seq_and_scores(1,'+',begin,end,flat_files,True)
+
+ begin = 0
+ end = 500
+ dna = qpalma.sequence_utils.get_seq_and_scores(8,'+',begin,end,flat_files,True)
+
+ pos_dna = full_pos_dna[-500:]
+ r_dna = qpalma.sequence_utils.reverse_complement(dna)
+ pos_dna == r_dna
+ pdb.set_trace()
+
+def run4():
+ flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ chromo = 8
+ strand = '-'
+ begin = 200
+ end = 1200
+ dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+ check_positions(dna,acc,don)
+
+ pdb.set_trace()
+
+
+def check_negative_strand(b,e):
+ flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ chromo = 8
+ strand = '-'
+ begin = b
+ end = e
+ dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+ pdb.set_trace()
+
+ check_positions(dna,acc,don)
+
+ print 'fine'
+
+def check_positive_strand(b,e):
+ flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ chromo = 1
+ strand = '+'
+ begin = b
+ end = e
+ dna,acc,don = qpalma.sequence_utils.get_seq_and_scores(chromo,strand,begin,end,flat_files)
+
+ check_positions(dna,acc,don)
+ print 'fine'
+
+if __name__ == '__main__':
+ #run()
+ #run2()
+ #run3(
+ run4()