4b5b7a04affd236664707812c137b7fe9dfd9d9d
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 from numpy.matlib import mat,zeros,ones,inf
5 import random
6 import pdb
7 import os.path
8 import cPickle
class Dataset:
    """Bare attribute container for a data set.

    The class defines no fields or methods of its own; code that creates
    an instance attaches whatever attributes it needs.
    """
def sample(population, k):
    """Draw k elements from a population sequence, with replacement.

    Unlike ``random.sample``, the same position may be picked more than
    once, so ``k`` may exceed ``len(population)``.

    Args:
        population: Indexable sequence to draw from.
        k: Number of elements to draw.

    Returns:
        A list of k elements of ``population``; empty when k <= 0.

    Raises:
        IndexError: If ``population`` is empty and k > 0.
    """
    n = len(population)
    # Exactly one random.random() call per draw, in order, so a given seed
    # yields the same elements as an explicit index loop would.
    # range() (rather than xrange) works on both Python 2 and Python 3.
    return [population[int(random.random() * n)] for _ in range(k)]
def generateData(numExamples):
    """Generate a fixed-layout artificial data set of random sequences.

    The global ``random`` generator is re-seeded with 14 on every call, so
    repeated calls with the same argument return the same data.

    Args:
        numExamples: Number of examples to generate.

    Returns:
        A tuple ``(Sequences, Acceptors, Donors, Exons, Ests, Qualities)``
        of lists with one entry per example:

        * Sequences -- random strings over 'acgt' of length 216.
        * Acceptors, Donors -- lists of 216 values, all 0.0.
        * Exons -- 2x2 matrices with rows (0, 72) and (144, 216).
        * Ests -- random strings over 'acgt' of length 36.
        * Qualities -- lists of 36 ints: 15 x 40, 6 x -1, 15 x 40.
    """
    dna_len = 216
    est_len = 36
    random.seed(14)

    letters = ['a', 'c', 'g', 't']

    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []

    for _ in range(numExamples):
        # The DNA string is drawn before the EST string in every
        # iteration; keep this order so the seeded output stays the same.
        dna = ''.join(sample(letters, dna_len))
        Sequences.append(dna)

        Acceptors.append([0.0] * dna_len)
        Donors.append([0.0] * dna_len)

        currentExon = zeros((2, 2))
        currentExon[0, 0] = 0
        currentExon[0, 1] = 72
        currentExon[1, 0] = 144
        currentExon[1, 1] = 216
        Exons.append(currentExon)

        est = ''.join(sample(letters, est_len))
        Ests.append(est)

    preNr = 15
    middleNr = 6
    sufNr = 15
    # Build a fresh list per example. Multiplying the outer list
    # ([[...]] * numExamples) would make every row the same object, so
    # editing one example's qualities would silently edit all of them.
    Qualities = [
        [40] * preNr + [-1] * middleNr + [40] * sufNr
        for _ in range(numExamples)
    ]

    return Sequences, Acceptors, Donors, Exons, Ests, Qualities
def generateData2(numExamples):
    """Generate artificial sequences containing a planted 'gt'...'ag' block.

    The global ``random`` generator is re-seeded with 14 on every call, so
    repeated calls with the same argument return the same data.

    For each example a random string over 'acgt' with a length drawn from
    [200, 400] is created, and the 60 characters starting at a random
    offset ``begin`` (drawn from [70, length - 70]) are replaced by
    18 x 't' + 'gt' + 20 x 'a' + 'ag' + 18 x 't'.

    Args:
        numExamples: Number of examples to generate.

    Returns:
        A tuple ``(Sequences, Acceptors, Donors, Exons, Ests, Qualities)``
        of lists with one entry per example:

        * Sequences -- the strings described above.
        * Acceptors -- per-position lists, -3.0 everywhere except 3.0 at
          indices ``end - 19`` and ``end - 20`` (``end = begin + 60``).
        * Donors -- per-position lists, -3.0 everywhere except 3.0 at
          indices ``begin + 19`` and ``begin + 20``.
        * Exons -- 2x2 matrices with rows ``(0, begin + 18)`` and
          ``(end - 18, len(sequence) - 1)``.
        * Ests -- the string 't' * 36.
        * Qualities -- lists of 36 ints, all 40.
    """
    est_len = 36
    random.seed(14)

    letters = ['a', 'c', 'g', 't']
    # 60-character block that overwrites dna[begin:begin + 60].
    planted = 't' * 18 + 'gt' + 'a' * 20 + 'ag' + 't' * 18

    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []

    for _ in range(numExamples):
        # Draw order (length, sequence, offset) is kept as-is so the
        # seeded output does not change.
        dna_len = random.randint(200, 400)
        dna = ''.join(sample(letters, dna_len))
        begin = random.randint(70, dna_len - 70)
        end = begin + 60
        dna = dna[0:begin] + planted + dna[end:]
        dna_len = len(dna)
        Sequences.append(dna)

        currentDon = [-3.0] * dna_len
        currentAcc = [-3.0] * dna_len

        # NOTE(review): 'gt' occupies indices begin+18 and begin+19, so
        # the donor marks sit one position to the right of it, whereas
        # the acceptor marks below sit exactly on 'ag' (end-20, end-19).
        # Left unchanged -- confirm the intended position convention.
        currentDon[begin + 18 + 1] = 3.0
        currentDon[begin + 18 + 2] = 3.0

        currentAcc[end - 18 - 1] = 3.0
        currentAcc[end - 18 - 2] = 3.0

        Donors.append(currentDon)
        Acceptors.append(currentAcc)

        currentExon = zeros((2, 2))
        currentExon[0, 0] = 0
        currentExon[0, 1] = begin + 18
        currentExon[1, 0] = end - 18
        currentExon[1, 1] = dna_len - 1
        Exons.append(currentExon)

        # The EST is a constant string of 't'; it is not derived from dna.
        Ests.append('t' * est_len)

    # Build a fresh list per example. Multiplying the outer list
    # ([[...]] * numExamples) would make every row the same object, so
    # editing one example's qualities would silently edit all of them.
    Qualities = [[40] * est_len for _ in range(numExamples)]

    return Sequences, Acceptors, Donors, Exons, Ests, Qualities
118 filename = 'artificial_dset_%d'%numExamples
119 if not os.path.exists(filename):
120 s,a,d,e,est,q = generateData2(numExamples)
121 dset = Dataset()
122 dset.Sequences = s
123 dset.Acceptor = a
124 dset.Donors = d
125 dset.Exons = e
126 dset.Ests = est
127 dset.Qualities = q
128 cPickle.dump(dset,open(filename,'w+'))
129 else:
131 s = dset.Sequences
132 a = dset.Acceptor
133 d = dset.Donors
134 e = dset.Exons
135 est = dset.Ests
136 q = dset.Qualities
138 return s,a,d,e,est,q
if __name__ == '__main__':
    # Smoke run: generate ten examples and print every returned field
    # except the raw sequences.
    Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(10)
    # Single-argument print(...) gives the same output under the Python 2
    # print statement and the Python 3 print function.
    for field in (Acceptors, Donors, Exons, Ests, Qualities):
        print(field)