+ renaming business
[qpalma.git] / qpalma / generateEvaluationData.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import random
6 import pdb
7 import os.path
8 import cPickle
9
10 class Dataset:
11 pass
12
13 def sample(population, k):
14 """Chooses k random elements from a population sequence. """
15 n = len(population)
16 result = [None] * k
17 for i in xrange(k):
18 j = int(random.random() * n)
19 result[i] = population[j]
20 return result
21
22 def generateData(numExamples):
23 dna_len = 216
24 est_len = 36
25 random.seed(14)
26
27 letters = ['a','c','g','t']
28
29 Sequences = []
30 Acceptors = []
31 Donors = []
32 Exons = []
33 Ests = []
34
35 for i in range(numExamples):
36 dna = ''.join(sample(letters, dna_len))
37 Sequences.append(dna)
38
39 Acceptors.append([0.0]*dna_len)
40 Donors.append([0.0]*dna_len)
41
42 currentExon = zeros((2,2))
43 currentExon[0,0] = 0
44 currentExon[0,1] = 72
45 currentExon[1,0] = 144
46 currentExon[1,1] = 216
47
48 Exons.append(currentExon)
49
50 est = ''.join(sample(letters, est_len))
51 Ests.append(est)
52
53 preNr = 15
54 middleNr = 6
55 sufNr = 15
56 Qualities = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
57
58 return Sequences, Acceptors, Donors, Exons, Ests, Qualities
59
60 def generateData2(numExamples):
61 est_len = 36
62 random.seed(14)
63
64 letters = ['a','c','g','t']
65
66 Sequences = []
67 Acceptors = []
68 Donors = []
69 Exons = []
70 Ests = []
71
72 for i in range(numExamples):
73 dna_len = random.randint(200,400)
74 dna = ''.join(sample(letters, dna_len))
75 #Sequences.append(dna)
76 begin = random.randint(70,dna_len-70)
77 end = begin+60
78 dna = dna[0:begin] + 't'*18 + 'gt' + 'a'*20 + 'ag' + 't'*18 + dna[end:]
79 dna_len = len(dna)
80 Sequences.append(dna)
81
82 currentDon = [-3.0]*dna_len
83 currentAcc = [-3.0]*dna_len
84
85 currentDon[begin+18+1] = 3.0
86 currentDon[begin+18+2] = 3.0
87
88 currentAcc[end-18-1] = 3.0
89 currentAcc[end-18-2] = 3.0
90
91 #pdb.set_trace()
92
93 Donors.append(currentDon)
94 Acceptors.append(currentAcc)
95
96 currentExon = zeros((2,2))
97 currentExon[0,0] = 0
98 currentExon[0,1] = begin+18
99 currentExon[1,0] = end-18
100 currentExon[1,1] = dna_len-1
101
102 Exons.append(currentExon)
103
104 #est = ''.join(sample(letters, est_len))
105 est = dna[begin-18:begin] + dna[end+1:end+19]
106 est = 't'*est_len
107 Ests.append(est)
108
109 preNr = 15
110 middleNr = 6
111 sufNr = 15
112 #Qualities = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
113 Qualities = [[40]*est_len]*numExamples
114
115 return Sequences, Acceptors, Donors, Exons, Ests, Qualities
116
117 def loadArtificialData(numExamples):
118 filename = 'artificial_dset_%d'%numExamples
119 if not os.path.exists(filename):
120 s,a,d,e,est,q = generateData2(numExamples)
121 dset = Dataset()
122 dset.Sequences = s
123 dset.Acceptor = a
124 dset.Donors = d
125 dset.Exons = e
126 dset.Ests = est
127 dset.Qualities = q
128 cPickle.dump(dset,open(filename,'w+'))
129 else:
130 dset = cPickle.load(open(filename))
131 s = dset.Sequences
132 a = dset.Acceptor
133 d = dset.Donors
134 e = dset.Exons
135 est = dset.Ests
136 q = dset.Qualities
137
138 return s,a,d,e,est,q
139
140 if __name__ == '__main__':
141 Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(10)
142 print Acceptors
143 print Donors
144 print Exons
145 print Ests
146 print Qualities