+ minor bugfixes in the compile_dataset script
[qpalma.git] / qpalma / Configuration.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy.matlib
5 import os.path
6 import cPickle
7
###############################################################################
#
# Load a random but fixed initial parameter vector; this makes debugging
# easier.
#
###############################################################################

# The file is opened in binary mode (the mode pickle expects for its data
# stream) and closed explicitly instead of leaving the handle to the garbage
# collector.
# NOTE: unpickling can execute arbitrary code -- only point this at a trusted
# file.
_fixed_param_file = open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/randInitParam.pickle', 'rb')
try:
    fixedParamQ = cPickle.load(_fixed_param_file)
finally:
    _fixed_param_file.close()
# Do not leave the temporary handle name behind in the module namespace.
del _fixed_param_file
15
###########################################################
#
# The parameters for the QPalma algorithm
#
###########################################################

# NOTE(review): presumably the regularization constant of the training
# problem -- confirm against the solver code.
C = 1

# Bounds on the intron length (lower, upper).
min_intron_len, max_intron_len = 10, 1000

# Bounds on the SVM score (lower, upper).
min_svm_score, max_svm_score = 0.0, 1.0
28
###############################################################################
#
# CHOOSING THE MODE
#
# 'normal'               -- work like Palma
# 'using_quality_scores' -- work like Palma, additionally using sequencing
#                           quality scores
#
###############################################################################

# mode = 'normal'
mode = 'using_quality_scores'

###############################################################################
#
# When using quality scores our scoring function is defined as
#
#     f: S_e x R x S -> R
#
# where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
#
# as opposed to a usage without quality scores when we only have
#
#     f: S x S -> R
#
# The matrix of plifs is defined as follows:
#
#     elem |  -  a  c  g  t  n
#     --------------------------
#     idx  |  0  1  2  3  4  5
#
#
#                    dna
#
#               -  a  c  g  t  n
#            a
#     est    c
#            g
#            t
#            n
#
# so the index can be calculated as (estnum-1)*6 + dnanum.
#
# At ests we do not have gaps with quality scores so we look up the matchmatrix
# instead.
###############################################################################

# Number of support points per piecewise linear function (plif).
# NOTE(review): "Don"/"Acc" presumably stand for donor/acceptor splice sites.
numDonSuppPoints = 30
numAccSuppPoints = 30
numLengthSuppPoints = 30
numQualSuppPoints = 16

# Range of the quality values.
min_qual = -1
max_qual = 40

# Not referenced anywhere else in the visible part of this file.
USE_OPT = True

if mode == 'normal':
    # Full 6x6 match matrix, no quality plifs.
    sizeMatchmatrix = (6, 6)
    estPlifs = 0
    dnaPlifs = 0
elif mode == 'using_quality_scores':
    # 5 est symbols x 6 dna symbols quality plifs (see the table above); the
    # match matrix shrinks to a single column.
    sizeMatchmatrix = (6, 1)
    estPlifs = 5
    dnaPlifs = 6
else:
    # Raised explicitly instead of `assert False` so that the check is not
    # stripped when Python runs with -O (which would leave the names above
    # undefined). The exception type is unchanged.
    raise AssertionError('Wrong operation mode specified')

numQualPlifs = estPlifs * dnaPlifs
totalQualSuppPoints = numQualPlifs * numQualSuppPoints

# Total length of the parameter vector: donor, acceptor and length plifs, the
# match matrix entries and all quality plif support points.
numFeatures = (numDonSuppPoints + numAccSuppPoints + numLengthSuppPoints
               + sizeMatchmatrix[0] * sizeMatchmatrix[1]
               + totalQualSuppPoints)
102
103
###############################################################################
#
# GENERAL SETTINGS CONCERNING THE SOLVER
#
###############################################################################

iter_steps = 40
remove_duplicate_scores = False
print_matrix = False
# NOTE(review): presumably the number of paths ("Anzahl") -- confirm against
# the code that consumes it.
anzpath = 2

# Cut the fixed initial parameter vector down to the number of features of the
# selected mode.
#
# The 'normal' branch used to read `fixedParam` before it was ever assigned,
# which raised a NameError. Only one vector (fixedParamQ) is loaded in this
# file, so both modes now take their prefix from it.
# NOTE(review): confirm that reusing a prefix of fixedParamQ is acceptable as
# the initial vector for 'normal' mode.
if mode in ('normal', 'using_quality_scores'):
    fixedParam = fixedParamQ[:numFeatures]
else:
    # Explicit raise (not `assert False`) so the check survives python -O.
    raise AssertionError('Wrong operation mode specified')
123
###############################################################################
#
# DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
#
###############################################################################

# Begin/end indices of the training and the prediction split.
training_begin, training_end = 0, 1500
prediction_begin, prediction_end = 0, 1500

# Short alias for joining path components.
joinp = os.path.join

# Base directories.
tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'
data_path = '/fml/ag-raetsch/share/projects/qpalma/solexa'

# Sub-directories of data_path.
original_path = os.path.join(data_path, 'original_solexa_data')
annot_path = os.path.join(data_path, 'annotation_data')
remapped_path = os.path.join(data_path, 'remapped_solexa_data')

# Input files located directly in data_path.
dna_flat_fn = os.path.join(data_path, 'allGenes.pickle')
filtered_fn = os.path.join(data_path, 'filteredReads_1_recent')

# Input files located in the sub-directories.
gff_fn = os.path.join(annot_path, 'TAIR7_GFF3_genes_Chr1.gff_v1')
remapped_fn = os.path.join(remapped_path, 'map_best_hit.18.unambig')

dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/chr1_dataset.pickle'
153
###############################################################################
#
# SANITY CHECKS
#
###############################################################################

# The number of quality plifs may be zero ('normal' mode); every plif needs
# more than one support point.
assert numQualPlifs >= 0
assert numDonSuppPoints > 1
assert numAccSuppPoints > 1
assert numLengthSuppPoints > 1
assert numQualSuppPoints > 1

# All input files must be present. Each message names the kind of file and the
# path that is missing; previously three different files shared the same
# 'EST/Reads data' message, which was also wrong for the GFF annotation file.
assert os.path.exists(dna_flat_fn), 'DNA data does not exist: %s' % dna_flat_fn
assert os.path.exists(gff_fn), 'Annotation (GFF) data does not exist: %s' % gff_fn
assert os.path.exists(filtered_fn), 'Filtered reads data does not exist: %s' % filtered_fn
assert os.path.exists(remapped_fn), 'Remapped reads data does not exist: %s' % remapped_fn