+ added configuration variables to main config file
[qpalma.git] / qpalma / Configuration.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy.matlib
5 import os.path
6 import cPickle
7
#
# Root directory in which all results of the QPalma pipeline are stored.
# Choose a path that fits your environment.
#

result_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run'
13
14
15 ###############################################################################
# Load a random but fixed initial parameter vector; this makes debugging easier.
17 ###############################################################################
18
19 #fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/randInitParam.pickle'))
20
###############################################################################
#
# The parameters for the QPalma algorithm
#
# The range settings listed here are currently disabled (commented out):
#
#min_intron_len = 20
#max_intron_len = 2000
#
#min_svm_score = 0.0
#max_svm_score = 1.0
#
#min_qual = -5
#max_qual = 40
#
###############################################################################

numConstraintsPerRound = 50
35
###############################################################################
#
# CHOOSING THE MODE
#
# 'normal'               means: work like Palma.
# 'using_quality_scores' means: work like Palma, plus using sequencing
#                        quality scores.
#
###############################################################################

#mode = 'normal'
mode = 'using_quality_scores'

###############################################################################
#
# When using quality scores our scoring function is defined as
#
#   f: S_e x R x S -> R,  where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
#
# as opposed to a usage without quality scores, when we only have
#
#   f: S x S -> R
#
# The matrix of plifs is defined as follows:
#
#   elem |  -  a  c  g  t  n
#   -------------------------
#   idx  |  0  1  2  3  4  5
#
#                dna
#
#              -  a  c  g  t  n
#           a
#   est     c
#           g
#           t
#           n
#
# so the index can be calculated as (estnum-1)*6 + dnanum.
# Ests do not have gaps with quality scores, so we look up the matchmatrix
# instead.
#
###############################################################################

read_size = 36
extension = (250, 500)

numLengthSuppPoints = 10
numDonSuppPoints = 10
numAccSuppPoints = 10
numQualSuppPoints = 10

if mode == 'normal':
    sizeMatchmatrix = (6, 6)
    estPlifs = 0
    dnaPlifs = 0
elif mode == 'using_quality_scores':
    sizeMatchmatrix = (6, 1)
    # 5 est rows times 6 dna columns, as laid out in the table above
    estPlifs = 5
    dnaPlifs = 6
else:
    # Raised explicitly instead of via `assert False`: assert statements are
    # removed when Python runs with -O, which would let an invalid mode slip
    # through and leave the names set in the branches above undefined.
    raise AssertionError('Wrong operation mode specified')

# Identical formula in every valid mode, hence computed once after the branch.
numQualPlifs = estPlifs * dnaPlifs
98
###############################################################################
#
# GENERAL SETTINGS CONCERNING THE SOLVER
#
###############################################################################

iter_steps = 40
remove_duplicate_scores = False
print_matrix = False
anzpath = 2

# Both valid modes currently run without a fixed initial parameter vector.
# The disabled line in the 'normal' branch shows where fixedParamQ would be
# plugged in (its loading code is commented out further up in this file).
if mode == 'normal':
    #fixedParam = fixedParamQ
    fixedParam = None
elif mode == 'using_quality_scores':
    fixedParam = None
else:
    # Explicit raise instead of `assert False`, so that the check is not
    # stripped when Python runs with -O (which would leave fixedParam unset).
    raise AssertionError('Wrong operation mode specified')
117
###############################################################################
#
# DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
#
###############################################################################

# begin/end of the training split and of the prediction split
training_begin, training_end = 0, 10000
prediction_begin, prediction_end = 10000, 40000

# short alias used for the path constructions below
joinp = os.path.join

tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'
data_path = '/fml/ag-raetsch/share/projects/qpalma/solexa'

# sub-directories located directly below data_path
original_path = joinp(data_path, 'original_solexa_data')
annot_path = joinp(data_path, 'annotation_data')
remapped_path = joinp(data_path, 'remapped_solexa_data')

dna_flat_fn = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

# the '%s' placeholder is left unfilled here
# (presumably the chromosome identifier - TODO confirm against the callers)
gff_fn = joinp(annot_path, 'TAIR7_GFF3_genes_Chr%s.gff_v1')
141
###############################################################################
#
# SANITY CHECKS
#
###############################################################################
#assert numQualPlifs >= 0
#assert numDonSuppPoints > 1
#assert numAccSuppPoints > 1
#assert numLengthSuppPoints > 1
#assert numQualSuppPoints > 1

# Checked with an explicit raise rather than an assert statement: asserts are
# removed when Python runs with -O, which would silently disable this check.
# The exception type and message are the same as those of the former assert.
if not os.path.exists(dna_flat_fn):
    raise AssertionError('DNA data does not exist!')
153
# basic symbols: gap character followed by the nucleotide letters
alphabet = ['-', 'a', 'c', 'g', 't', 'n']
# the basic symbols plus the two bracket characters (a separate list object)
extended_alphabet = alphabet + ['[', ']']
156
###############################################################################
#
# Settings for the VMatch pipeline steps
#
###############################################################################

reads_location = ''

#
# Settings of the first VMatch step
#

suffixtree_1 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
seedlength_1 = 9
mismatches_1 = 2
end_gap_1 = 0
repeat_mapping_1 = 1

#
# Settings of the second VMatch step
#

suffixtree_2 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
seedlength_2 = 9
mismatches_2 = 1
sub_mismatches_2 = 1
min_short_end_2 = 14
repeat_mapping_2 = 1
185
#
# ---------------------------------------------------------------------------
# Do not modify anything below this line.
# ---------------------------------------------------------------------------
#

# location of 'config_object.pickle' inside the result directory
conf_object_path = os.path.join(result_dir, 'config_object.pickle')
191
192 conf_object_path = os.path.join(result_dir,'config_object.pickle')