fd68fe92142966c383d19030164ba8da0a77932d
[qpalma.git] / scripts / Lookup.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import random
6 import os
7 import re
8 import pdb
9 import random
10
11 from numpy.matlib import inf
12 from qpalma.sequence_utils import get_seq_and_scores,get_flatfile_size
13
14
15 class Lookup:
16 """
17 Speed up genomic and splice site score queries by prefetching the sequences
18 and the scores once for all chromosomes.
19 """
20
21 def __init__(self,dna_flat_files):
22 self.dna_flat_files = dna_flat_files
23
24 self.chromosomes = range(1,6)
25 self.strands = ['+','-']
26
27 self.genomicSequencesP = [0]*(len(self.chromosomes)+1)
28 self.acceptorScoresP = [0]*(len(self.chromosomes)+1)
29 self.donorScoresP = [0]*(len(self.chromosomes)+1)
30
31 self.genomicSequencesN = [0]*(len(self.chromosomes)+1)
32 self.acceptorScoresN = [0]*(len(self.chromosomes)+1)
33 self.donorScoresN = [0]*(len(self.chromosomes)+1)
34
35 #for chrId in self.chromosomes:
36 for chrId in range(1,6):
37 for strandId in self.strands:
38 print (chrId,strandId)
39 self.prefetch_seq_and_scores(chrId,strandId)
40
41
42 def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
43 assert chromo in self.chromosomes
44 assert strand in self.strands
45
46 if strand == '+':
47 currentSequence = self.genomicSequencesP[chromo][start:stop+1]
48 currentDonorScores = self.donorScoresP[chromo][start:stop+1]
49 currentAcceptorScores = self.acceptorScoresP[chromo][start:stop+1]
50 elif strand == '-':
51 end = len(self.genomicSequencesN[chromo])
52 currentSequence = self.genomicSequencesN[chromo][end-stop-1:end-start]
53 #length = len(currentSequence)
54 currentDonorScores = self.donorScoresN[chromo][end-stop-1:end-start]
55 #currentDonorScores += [-inf]*(length-len(currentDonorScores))
56 currentAcceptorScores = self.acceptorScoresN[chromo][end-stop-1:end-start]
57 else:
58 pdb.set_trace()
59
60 return currentSequence.lower(), currentAcceptorScores, currentDonorScores
61
62
63 def prefetch_seq_and_scores(self,chromo,strand):
64 """
65 This function expects an interval, chromosome and strand information and
66 returns then the genomic sequence of this interval and the associated scores.
67 """
68
69 fn = 'chr%d.dna.flat' % chromo
70 filename = os.path.join(self.dna_flat_files,fn)
71 size = get_flatfile_size(filename)
72
73 # we do not use the first/last 205 position as they are can not be
74 # predicted due to the window size used for splice site prediction
75 genomicSeq_start = 205
76 genomicSeq_stop = size-205
77 genomicSeq,currentAcc,currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,self.dna_flat_files)
78 genomicSeq = genomicSeq.lower()
79
80 U = 'u'*205
81 INF_F = [-inf]*205
82
83 if strand == '+':
84 self.genomicSequencesP[chromo] = U + genomicSeq + U
85 self.acceptorScoresP[chromo] = INF_F + currentAcc + INF_F
86 self.donorScoresP[chromo] = INF_F + currentDon + INF_F
87 elif strand == '-':
88 self.genomicSequencesN[chromo] = U + genomicSeq + U
89 self.acceptorScoresN[chromo] = INF_F + currentAcc + INF_F
90 self.donorScoresN[chromo] = INF_F + currentDon + INF_F
91 else:
92 assert False
93
94
95 def getSpliceScores(self,chr,strand,intervalBegin,intervalEnd):
96 """
97 Now we want to use interval_query to get the predicted splice scores trained
98 on the TAIR sequence and annotation.
99 """
100
101 assert intervalEnd > intervalBegin
102
103 size = intervalEnd-intervalBegin
104
105 acc = size*[0.0]
106 don = size*[0.0]
107
108 interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
109 pos_size = new_intp()
110 intp_assign(pos_size,1)
111
112 # fetch acceptor scores
113 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
114 acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
115
116 # fetch donor scores
117 sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
118 don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
119
120 return acc, don
121
122
123 if __name__ == '__main__':
124 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
125 lt1 = Lookup(dna_flat_files)