+ minor changes
[qpalma.git] / qpalma / Configuration.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy.matlib
5 import os.path
6 import cPickle
7
###############################################################################
#
# Load a random but fixed initial parameter vector; this makes debugging
# easier.
#
###############################################################################

# NOTE(review): unpickling can execute arbitrary code -- only keep this if the
# pickle file is trusted and under our own control.
# The handle is closed explicitly (try/finally instead of a bare open()) so the
# descriptor is not left dangling for the lifetime of the process. Binary mode
# is what the pickle documentation asks for with any protocol.
_param_file = open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/randInitParam.pickle', 'rb')
try:
    fixedParamQ = cPickle.load(_param_file)
finally:
    _param_file.close()
15
###############################################################################
#
# The parameters for the QPalma algorithm
#
###############################################################################

# NOTE(review): presumably the regularization constant of the training
# problem -- confirm against the code that consumes it.
C = 1
22
23
24
###############################################################################
#
# CHOOSING THE MODE
#
# 'normal' means work like Palma; 'using_quality_scores' means work like Palma
# plus using sequencing quality scores.
#
###############################################################################

#mode = 'normal'
mode = 'using_quality_scores'

###############################################################################
#
# When using quality scores our scoring function is defined as
#
#     f: S_e x R x S -> R
#
# where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
#
# as opposed to a usage without quality scores when we only have
#
#     f: S x S -> R
#
# The matrix of plifs is defined as follows:
#
#     elem |  -  a  c  g  t  n
#     -------------------------
#     idx  |  0  1  2  3  4  5
#
#
#                    dna
#
#               -  a  c  g  t  n
#            a
#     est    c
#            g
#            t
#            n
#
# so the index can be calculated as (estnum-1)*6 + dnanum.
#
# At ests we do not have gaps with quality scores so we look up the matchmatrix
# instead.
###############################################################################

# Number of supporting points per piecewise linear function (plif).
numDonSuppPoints = 30
numAccSuppPoints = 30
numLengthSuppPoints = 30
numQualSuppPoints = 16

min_qual = -1
max_qual = 40

USE_OPT = True

# Matchmatrix shape and the est/dna dimensions of the quality plif matrix
# depend on the operation mode chosen above.
if mode == 'normal':
    sizeMatchmatrix = (6, 6)
    estPlifs = 0
    dnaPlifs = 0
elif mode == 'using_quality_scores':
    sizeMatchmatrix = (6, 1)
    estPlifs = 5
    dnaPlifs = 6
else:
    assert False, 'Wrong operation mode specified'

# One quality plif per (est symbol, dna symbol) pair; zero in 'normal' mode.
numQualPlifs = estPlifs * dnaPlifs

totalQualSuppPoints = numQualPlifs * numQualSuppPoints

# Total length of the parameter vector: all plif supporting points plus one
# entry per matchmatrix cell.
numFeatures = (numDonSuppPoints + numAccSuppPoints + numLengthSuppPoints
               + sizeMatchmatrix[0] * sizeMatchmatrix[1]
               + totalQualSuppPoints)
98
99
###############################################################################
#
# GENERAL SETTINGS CONCERNING THE SOLVER
#
###############################################################################

iter_steps = 40
remove_duplicate_scores = False
print_matrix = False
# NOTE(review): presumably the number of alignment paths to compute
# ("Anzahl Pfade") -- confirm against the consumer of this setting.
anzpath = 2

# Cut the fixed initial parameter vector down to the feature count of the
# active mode. Only fixedParamQ is ever loaded in this module, so the former
# 'normal' branch (which sliced an undefined name `fixedParam`) raised a
# NameError; both modes now slice fixedParamQ.
# NOTE(review): assumes fixedParamQ holds at least numFeatures entries --
# a shorter vector is silently truncated by the slice.
if mode in ('normal', 'using_quality_scores'):
    fixedParam = fixedParamQ[:numFeatures]
else:
    assert False, 'Wrong operation mode specified'
119
###############################################################################
#
# DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
#
###############################################################################

# Index ranges of the training and prediction splits.
# NOTE(review): whether the end indices are inclusive or exclusive is decided
# by the code that consumes them -- confirm there.
training_begin = 0
training_end = 1500

prediction_begin = 1500
prediction_end = 2200

# Short alias used for all path construction below.
joinp = os.path.join

tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'
data_path = '/fml/ag-raetsch/share/projects/qpalma/solexa'

original_path = joinp(data_path, 'original_solexa_data')
annot_path = joinp(data_path, 'annotation_data')
remapped_path = joinp(data_path, 'remapped_solexa_data')

dna_flat_fn = joinp(data_path, 'allGenes.pickle')
gff_fn = joinp(annot_path, 'TAIR7_GFF3_genes_Chr1.gff_v1')
filtered_fn = joinp(data_path, 'filteredReads_1_recent')
remapped_fn = joinp(remapped_path, 'map_best_hit.18.unambig')

dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/chr1_dataset.pickle'
149
###############################################################################
#
# SANITY CHECKS
#
# These run once, when the module is imported. They are plain assert
# statements, so they are skipped entirely when Python runs with -O.
#
###############################################################################

# Feature dimensions: no negative plif count, and more than one supporting
# point for every plif.
assert numQualPlifs >= 0
assert numDonSuppPoints > 1
assert numAccSuppPoints > 1
assert numLengthSuppPoints > 1
assert numQualSuppPoints > 1

# Required input files must be present before any work starts. The message
# for gff_fn used to say 'EST/Reads data', which pointed at the wrong file.
assert os.path.exists(dna_flat_fn), 'DNA data does not exist!'
assert os.path.exists(gff_fn), 'Annotation data does not exist!'
assert os.path.exists(filtered_fn), 'EST/Reads data does not exist!'
assert os.path.exists(remapped_fn), 'EST/Reads data does not exist!'