+ moved sequence access functions to own module
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4
5 def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
6 """
7 Now we want to use interval_query to get the predicted splice scores trained
8 on the TAIR sequence and annotation.
9 """
10
11 size = intervalEnd-intervalBegin
12 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
13
14 acc = size*[0.0]
15 don = size*[0.0]
16
17 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
18 pos_size = new_intp()
19 intp_assign(pos_size,1)
20
21 # fetch acceptor scores
22 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
23 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
24
25 # fetch donor scores
26 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
27 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
28
29 return acc, don
30
31
32 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
33 """
34 This function expects an interval, chromosome and strand information and
35 returns then the genomic sequence of this interval and the associated scores.
36 """
37
38 assert genomicSeq_start < genomicSeq_stop
39
40 chrom = 'chr%d' % chr
41 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
42 genomicSeq = genomicSeq.lower()
43
44 # check the obtained dna sequence
45 assert genomicSeq != '', 'load_genomic returned empty sequence!'
46
47 # all entries other than a c g t are set to n
48 no_base = re.compile('[^acgt]')
49 genomicSeq = no_base.sub('n',genomicSeq)
50
51 intervalBegin = genomicSeq_start-100
52 intervalEnd = genomicSeq_stop+100
53 seq_pos_offset = genomicSeq_start
54
55 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
56
57 currentAcc = currentAcc[100:-98]
58 currentAcc = currentAcc[1:]
59 currentDon = currentDon[100:-100]
60
61 length = len(genomicSeq)
62 currentAcc = currentAcc[:length]
63
64 currentDon = currentDon+[-inf]*(length-len(currentDon))
65
66 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
67 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
68
69 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
70 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
71 assert len(currentAcc) == len(currentDon)
72
73 return genomicSeq, currentAcc, currentDon