fb70c3a4f90eab2fe152c0e2ec62618eb5c3667d
[qpalma.git] / qpalma / sequence_utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import os
5 import pdb
6 import random
7 import re
8 import sys
9
10 from numpy.matlib import inf
11
12 from Genefinding import *
13 from genome_utils import load_genomic
14
15
16 def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
17 """
18 Now we want to use interval_query to get the predicted splice scores trained
19 on the TAIR sequence and annotation.
20 """
21
22 size = intervalEnd-intervalBegin
23 assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
24
25 acc = size*[0.0]
26 don = size*[0.0]
27
28 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
29 pos_size = new_intp()
30 intp_assign(pos_size,1)
31
32 # fetch acceptor scores
33 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
34 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
35
36 # fetch donor scores
37 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
38 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
39
40 return acc, don
41
42
43 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
44 """
45 This function expects an interval, chromosome and strand information and
46 returns then the genomic sequence of this interval and the associated scores.
47 """
48
49 assert genomicSeq_start < genomicSeq_stop
50
51 chrom = 'chr%d' % chr
52 genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
53 genomicSeq = genomicSeq.lower()
54
55 # check the obtained dna sequence
56 assert genomicSeq != '', 'load_genomic returned empty sequence!'
57
58 # all entries other than a c g t are set to n
59 no_base = re.compile('[^acgt]')
60 genomicSeq = no_base.sub('n',genomicSeq)
61
62 intervalBegin = genomicSeq_start-100
63 intervalEnd = genomicSeq_stop+100
64 seq_pos_offset = genomicSeq_start
65
66 currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
67
68 currentAcc = currentAcc[100:-98]
69 currentAcc = currentAcc[1:]
70 currentDon = currentDon[100:-100]
71
72 length = len(genomicSeq)
73 currentAcc = currentAcc[:length]
74
75 currentDon = currentDon+[-inf]*(length-len(currentDon))
76
77 ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
78 gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
79
80 assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
81 assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
82 assert len(currentAcc) == len(currentDon)
83
84 return genomicSeq, currentAcc, currentDon