from genome_utils import load_genomic
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
+def reverse_complement(seq):
+ """
+ This function takes a read in plain or bracket notation and returns the
+ reverse complement of it.
+ I.e.
+
+ est = cgt[ac][tg]a
+
+ leads to
+
+ rev_comp = t[ac][tg]acg
+ """
+
+ bpos = seq.find('[')
+ rc = lambda x: {'a':'t','c':'g','g':'c','t':'a'}[x]
+
+ # check first whether seq contains no brackets at all
+ if bpos == -1:
+ ret_val = map(rc,seq)
+ ret_val.reverse()
+ ret_val = "".join(ret_val)
+ else:
+ brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','[':'[',']':']'}[x]
+
+ # first_part is the part of the seq up to the first occurrence of a
+ # bracket
+ first_part = seq[:bpos]
+ first_part = map(rc,first_part)
+ first_part.reverse()
+ first_part = "".join(first_part)
+
+ # inside brackets has to be complemented but NOT reversed
+ inside_brackets = seq[bpos+1:bpos+3]
+ inside_brackets = "".join(map(rc,inside_brackets))
+
+ ret_val = '%s[%s]%s'%(reverse_complement(seq[bpos+4:]),inside_brackets,first_part)
+
+ return ret_val
+
+
+def unbracket_est(est):
+ """
+ This function takes a read in bracket notation and restores the read sequence from it.
+ I.e.
+
+ est = cgt[ac][tg]aa
+
+ leads to
+
+ result = cgtcgaa
+
+ so the second entry within brackets is the base on the read whereas the first
+ entry is the base from the dna.
+ """
+
+ new_est = ''
+ e = 0
+
+ while True:
+ if e >= len(est):
+ break
+
+ if est[e] == '[':
+ new_est += est[e+2]
+ e += 4
+ else:
+ new_est += est[e]
+ e += 1
+
+ return "".join(new_est).lower()
+
+
+def create_bracket_seq(dna_seq,read_seq):
+ """
+ This function takes a dna sequence and a read sequence and returns the
+ bracket format of the match/mismatches i.e.
+
+ dna : aaa
+ read: aac
+ is written in bracket notation: aa[ac]
+ """
+ assert len(dna_seq) == len(read_seq)
+ return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
+
+
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size=0):
"""
Now we want to use interval_query to get the predicted splice scores trained
on the TAIR sequence and annotation.
# fetch acceptor scores
sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
- acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+ acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
# fetch donor scores
sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
- don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+ don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename,total_size)
return acc, don
no_base = re.compile('[^acgt]')
genomicSeq = no_base.sub('n',genomicSeq)
- intervalBegin = genomicSeq_start-100
- intervalEnd = genomicSeq_stop+100
- seq_pos_offset = genomicSeq_start
+ if strand == '+':
+ intervalBegin = genomicSeq_start-100
+ intervalEnd = genomicSeq_stop+100
+
+ currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
+ currentAcc = currentAcc[100:-98]
+ currentAcc = currentAcc[1:]
+ currentDon = currentDon[100:-100]
+
+ length = len(genomicSeq)
+ currentAcc = currentAcc[:length]
+
+ currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+ ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
+ gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+
+ assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+ assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+ assert len(currentAcc) == len(currentDon)
+
+ return genomicSeq, currentAcc, currentDon
+
+ # build reverse complement if on negative strand
+ if strand == '-':
+ genomicSeq = reverse_complement(genomicSeq)
+
+ fn = 'chr%d.dna.flat' % chr
+ filename = os.path.join(dna_flat_files,fn)
+ cmd = 'wc -c %s | cut -f1 -d \' \'' % filename
+ import subprocess
+ obj = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+ out,err = obj.communicate()
+
+ if err != '':
+ print 'Error occurred while trying to obtain file size'
+ end = int(out)
+
+ print 'size is %d' % end
+
+ intervalBegin = genomicSeq_start-100
+ intervalEnd = genomicSeq_stop+100
- currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+ total_size = end
+ currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,total_size)
- currentAcc = currentAcc[100:-98]
- currentAcc = currentAcc[1:]
- currentDon = currentDon[100:-100]
+ currentAcc = currentAcc[100:-98]
+ currentAcc = currentAcc[1:]
+ currentDon = currentDon[100:-100]
- length = len(genomicSeq)
- currentAcc = currentAcc[:length]
+ length = len(genomicSeq)
+ currentAcc = currentAcc[:length]
- currentDon = currentDon+[-inf]*(length-len(currentDon))
+ currentDon = currentDon+[-inf]*(length-len(currentDon))
- ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
- gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+ ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g']
+ gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
- assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
- assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
- assert len(currentAcc) == len(currentDon)
+ assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+ assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+ assert len(currentAcc) == len(currentDon)
- return genomicSeq, currentAcc, currentDon
+ return genomicSeq, currentAcc, currentDon