+ added small program for intron-position comparison
[qpalma.git] / qpalma / Configuration.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import os.path
5 import cPickle
6
#
# Directory in which every result of the QPalma pipeline is stored.
#

result_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run'
12
13
14 ###############################################################################
# Load a random but fixed initial parameter vector; this makes debugging easier.
16 ###############################################################################
17
18 #fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/randInitParam.pickle'))
19
20 ###############################################################################
21 #
22 # The parameters for the QPalma algorithm
23 #
24 #min_intron_len = 20
25 #max_intron_len = 2000
26 #
27 #min_svm_score = 0.0
28 #max_svm_score = 1.0
29 #
30 #min_qual = -5
31 #max_qual = 40
32
# NOTE(review): presumably the number of constraints handled per solver
# round -- confirm against the training code.
numConstraintsPerRound = 50
34
35 ###############################################################################
36 #
37 # CHOOSING THE MODE
38 #
# 'normal' means work like Palma; 'using_quality_scores' means work like Palma
# plus using sequencing quality scores.
41 #
42 ###############################################################################
43
# Active operation mode. Swap the two lines below to run in 'normal' mode.
#mode = 'normal'
mode = 'using_quality_scores'
46
47 ###############################################################################
48 #
49 # When using quality scores our scoring function is defined as
50 #
51 # f: S_e x R x S -> R, where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
52 #
53 # as opposed to a usage without quality scores when we only have
54 #
55 # f: S x S -> R
56 #
57 # The matrix of plifs is defined as follows:
58 #
#     elem |   -  a  c  g  t  n
#     -------------------------
#     idx  |   0  1  2  3  4  5
#
#                   dna
#
#              -  a  c  g  t  n
#           a
#     est   c
#           g
#           t
#           n
71 #
72 # so the index can be calculated as (estnum-1)*6 + dnanum.
73 # Ests do not have gaps with quality scores so we look up the matchmatrix
74 # instead.
75 ###############################################################################
76
# Read length and extension window.
# NOTE(review): the units (presumably nucleotides) and the meaning of the two
# extension values are not visible in this file -- confirm with the callers.
read_size = 36
extension = (250, 500)

# Number of support points per piecewise-linear function family; all four
# families currently share the same count.
numLengthSuppPoints = numDonSuppPoints = 10
numAccSuppPoints = numQualSuppPoints = 10
84
# Mode-dependent dimensions of the scoring tables.
#
#   sizeMatchmatrix -- shape of the match matrix
#   estPlifs        -- number of est symbols that get quality plifs
#   dnaPlifs        -- number of dna symbols that get quality plifs
#   numQualPlifs    -- total number of quality plifs (est x dna)
if mode == 'normal':
    # No quality scores: the full 6x6 match matrix is used and there are no
    # quality plifs at all.
    sizeMatchmatrix = (6, 6)
    estPlifs = 0
    dnaPlifs = 0
elif mode == 'using_quality_scores':
    # Quality scores: 5 est symbols x 6 dna symbols worth of plifs, and a
    # 6x1 match matrix.
    sizeMatchmatrix = (6, 1)
    estPlifs = 5
    dnaPlifs = 6
else:
    # Raised explicitly instead of via `assert False` so that the check is
    # not stripped under `python -O`, which would leave the names above
    # undefined. Exception type and message are unchanged.
    raise AssertionError('Wrong operation mode specified')

numQualPlifs = estPlifs * dnaPlifs
97
98 ###############################################################################
99 #
100 # GENERAL SETTINGS CONCERNING THE SOLVER
101 #
102 ###############################################################################
103
# NOTE(review): iter_steps presumably bounds the number of solver
# iterations and anzpath the number of paths considered -- confirm against
# the code that reads them.
iter_steps = 40
anzpath = 2

# Both optional behaviours are switched off.
remove_duplicate_scores = print_matrix = False
108
# Validate the mode explicitly rather than with `assert False`, so the check
# is not stripped under `python -O`, which would leave fixedParam undefined.
# Exception type and message are unchanged.
if mode not in ('normal', 'using_quality_scores'):
    raise AssertionError('Wrong operation mode specified')

# No fixed initial parameter vector is used in either mode. To debug 'normal'
# mode with the pickled vector, re-enable the fixedParamQ load near the top of
# this file and assign it here:
#fixedParam = fixedParamQ
fixedParam = None
116
117 ###############################################################################
118 #
119 # DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
120 #
121 ###############################################################################
# Index ranges of the training and the prediction split.
# NOTE(review): whether the end indices are inclusive is not visible here.
training_begin, training_end = 0, 10000
prediction_begin, prediction_end = 10000, 40000

# Short alias used for the path construction below.
joinp = os.path.join

tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'
data_path = '/fml/ag-raetsch/share/projects/qpalma/solexa'

# The three data sub-directories all live directly below data_path.
original_path, annot_path, remapped_path = (
    joinp(data_path, subdir)
    for subdir in ('original_solexa_data',
                   'annotation_data',
                   'remapped_solexa_data')
)

# Genome directory (note the trailing slash).
dna_flat_fn = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

# File name template; '%s' is filled in by the caller.
# NOTE(review): presumably the chromosome identifier -- confirm.
gff_fn = joinp(annot_path, 'TAIR7_GFF3_genes_Chr%s.gff_v1')
140
141 ###############################################################################
142 #
143 # SANITY CHECKS
144 #
145 ###############################################################################
146 #assert numQualPlifs >= 0
147 #assert numDonSuppPoints > 1
148 #assert numAccSuppPoints > 1
149 #assert numLengthSuppPoints > 1
150 #assert numQualSuppPoints > 1
# Fail early if the genome location is missing. This is an explicit check
# rather than a bare `assert` so it still runs under `python -O`; the
# exception type and message are unchanged.
if not os.path.exists(dna_flat_fn):
    raise AssertionError('DNA data does not exist!')
152
# Base alphabet: gap symbol followed by the four nucleotides and 'n'.
alphabet = list('-acgtn')

# The extended alphabet additionally contains the two bracket symbols.
extended_alphabet = alphabet + ['[', ']']
155
156 ###############################################################################
157 #
158 # Settings for the VMatch pipeline steps
159 #
160 ###############################################################################
161
# Location of the reads; left empty in this configuration.
reads_location = ''

#
# First VMatch step
#

mismatches_1, end_gap_1 = 2, 0
repeat_mapping_1, seedlength_1 = 1, 9
suffixtree_1 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'

#
# Second VMatch step (currently points at the same suffix tree file as the
# first step)
#

mismatches_2, sub_mismatches_2 = 1, 1
min_short_end_2 = 14
repeat_mapping_2, seedlength_2 = 1, 9
suffixtree_2 = '/media/oka_raid/nobackup/data/Vmatch/ATH/ATH1.v5.seed9.fa'
184
#
# Do not modify anything below this line.
#

# Full path of 'config_object.pickle' inside the result directory.
conf_object_path = os.path.join(result_dir, 'config_object.pickle')