+ fixed a bug in the filtering (forgot to disable remove_ambiguities)
[qpalma.git] / qpalma / Configuration.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy.matlib
5 import os.path
6 import cPickle
7
8 ###############################################################################
9 #
10 # Load a random but fixed initial parameter vector this makes debugging easier
11 #
12 ###############################################################################
13
14 fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/randInitParam.pickle'))
15
16 ###########################################################
17 #
18 # The parameters for the QPalma algorithm
19 #
20 #
21 C = 100
22
23 min_intron_len = 20
24 max_intron_len = 2000
25
26 min_svm_score = 0.0
27 max_svm_score = 1.0
28
29 numConstraintsPerRound = 50
30
31 ###############################################################################
32 #
33 # CHOOSING THE MODE
34 #
35 # 'normal' means work like Palma 'using_quality_scores' means work like Palma
36 # plus using sequencing quality scores
37 #
38 ###############################################################################
39
40 #mode = 'normal'
41 mode = 'using_quality_scores'
42
43 ###############################################################################
44 #
45 # When using quality scores our scoring function is defined as
46 #
47 # f: S_e x R x S -> R
48 #
49 # where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
50 #
51 # as opposed to a usage without quality scores when we only have
52 #
53 # f: S x S -> R
54 #
55 # The matrix of plifs is defined as follows:
56 #
57 # elem | - a c g t n
58 # -------------------------
59 # idx | 0 1 2 3 4 5
60 #
61 #
62 # dna
63 #
64 # - a c g t n
65 # a
66 # est c
67 # g
68 # t
69 # n
70 #
71 # so the index can be calculated as (estnum-1)*6 + dnanum.
72 #
73 # At ests we do not have gaps with quality scores so we look up the matchmatrix
74 # instead.
75 ###############################################################################
76
77 read_size = 36
78
79 extension = (1,20)
80
81 #numLengthSuppPoints = 30
82 #numDonSuppPoints = 30
83 #numAccSuppPoints = 30
84 #numQualSuppPoints = 16
85
86 #numLengthSuppPoints = 20
87 #numDonSuppPoints = 20
88 #numAccSuppPoints = 20
89 #numQualSuppPoints = 8
90
91 numLengthSuppPoints = 10
92 numDonSuppPoints = 10
93 numAccSuppPoints = 10
94 numQualSuppPoints = 10
95
96 min_qual = -5
97 max_qual = 40
98
99 USE_OPT = True
100
101 if mode == 'normal':
102 sizeMatchmatrix = (6,6)
103 estPlifs = 0
104 dnaPlifs = 0
105 numQualPlifs = estPlifs*dnaPlifs
106 elif mode == 'using_quality_scores':
107 sizeMatchmatrix = (6,1)
108 estPlifs = 5
109 dnaPlifs = 6
110 numQualPlifs = estPlifs*dnaPlifs
111 else:
112 assert False, 'Wrong operation mode specified'
113
114 totalQualSuppPoints = numQualPlifs*numQualSuppPoints
115
116 numFeatures = numDonSuppPoints + numAccSuppPoints\
117 + numLengthSuppPoints + sizeMatchmatrix[0]*sizeMatchmatrix[1] + totalQualSuppPoints
118
119
120 ###############################################################################
121 #
122 # GENERAL SETTINGS CONCERNING THE SOLVER
123 #
124 #
125 #
126 ###############################################################################
127
128 iter_steps = 40
129 remove_duplicate_scores = False
130 print_matrix = False
131 anzpath = 2
132
133 if mode == 'normal':
134 fixedParam = fixedParamQ[:numFeatures]
135 elif mode == 'using_quality_scores':
136 fixedParam = fixedParamQ[:numFeatures]
137 else:
138 assert False, 'Wrong operation mode specified'
139
140 ###############################################################################
141 #
142 # DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
143 #
144 #
145 #
146 ###############################################################################
147
148 training_begin = 0
149 training_end = 500
150
151 prediction_begin = 500
152 prediction_end = 2000
153
154 joinp = os.path.join
155
156 tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'
157 data_path = '/fml/ag-raetsch/share/projects/qpalma/solexa'
158
159 original_path = joinp(data_path,'original_solexa_data')
160 annot_path = joinp(data_path,'annotation_data')
161 remapped_path = joinp(data_path,'remapped_solexa_data')
162
163 dna_flat_fn = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
164 gff_fn = joinp(annot_path,'TAIR7_GFF3_genes_Chr%s.gff_v1')
165 #filtered_fn = joinp(data_path,'allFilteredReads_07_02_2008')
166 filtered_fn = joinp(data_path,'filteredReads_test')
167 remapped_fn = joinp(remapped_path,'map_best_hit.18.unambig')
168
169 dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/chr1_dataset.pickle'
170
171 ###############################################################################
172 #
173 # SANITY CHECKS
174 #
175 ###############################################################################
176 assert numQualPlifs >= 0
177 assert numDonSuppPoints > 1
178 assert numAccSuppPoints > 1
179 assert numLengthSuppPoints > 1
180 assert numQualSuppPoints > 1
181
182 assert os.path.exists(dna_flat_fn), 'DNA data does not exist!'
183 assert os.path.exists(filtered_fn), 'EST/Reads data does not exist!'
184 assert os.path.exists(remapped_fn), 'EST/Reads data does not exist!'
185
186 extended_alphabet = ['-','a','c','g','t','n','[',']']
187 alphabet = ['-','a','c','g','t','n']