+ changes in the main configuration -> no pickled init vector necessary anymore
[qpalma.git] / qpalma / Configuration.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import numpy.matlib
5 import os.path
6 import cPickle
7
8 ###############################################################################
9 # Load a random but fixed initial parameter vector; this makes debugging easier
10 ###############################################################################
11
12 #fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/randInitParam.pickle'))
13
14 ###############################################################################
15 #
16 # The parameters for the QPalma algorithm
17 #
18 #min_intron_len = 20
19 #max_intron_len = 2000
20 #
21 #min_svm_score = 0.0
22 #max_svm_score = 1.0
23 #
24 #min_qual = -5
25 #max_qual = 40
26
# Active QPalma parameter.
# NOTE(review): presumably the number of constraints handled per round by the
# solver -- confirm against the code that reads it.
numConstraintsPerRound = 50
28
29 ###############################################################################
30 #
31 # CHOOSING THE MODE
32 #
33 # 'normal' means work like Palma; 'using_quality_scores' means work like Palma
34 # plus using sequencing quality scores.
35 #
36 ###############################################################################
37
# Active operation mode. The alternative value is kept commented out so that
# switching modes is a one-line change.
#mode = 'normal'
mode = 'using_quality_scores'
40
41 ###############################################################################
42 #
43 # When using quality scores our scoring function is defined as
44 #
45 # f: S_e x R x S -> R, where S_e is {A,C,G,T,N} and S = {A,C,G,T,N,-}
46 #
47 # as opposed to a usage without quality scores when we only have
48 #
49 # f: S x S -> R
50 #
51 # The matrix of plifs is defined as follows:
52 #
53 #  elem |  -  a  c  g  t  n
54 #  ------------------------
55 #  idx  |  0  1  2  3  4  5
56 #
57 #               dna
58 #
59 #          -  a  c  g  t  n
60 #       a
61 #  est  c
62 #       g
63 #       t
64 #       n
65 #
66 # so the index can be calculated as (estnum-1)*6 + dnanum.
67 # Ests do not have gaps with quality scores so we look up the matchmatrix
68 # instead.
69 ###############################################################################
70
# NOTE(review): presumably the length of a single read and the pair of
# extension values applied around it -- confirm units with the callers.
read_size = 36
extension = (250, 500)

# All four support-point counts currently share the same value.
# NOTE(review): presumably the number of supporting points of the respective
# piecewise linear functions (length, donor, acceptor, quality) -- confirm.
numLengthSuppPoints = numDonSuppPoints = numAccSuppPoints = numQualSuppPoints = 10
78
# Mode-dependent dimensions: the shape of the match matrix and the size of
# the est x dna grid of quality plifs.
if mode == 'normal':
    # Without quality scores: full 6x6 match matrix, no quality plifs.
    sizeMatchmatrix = (6, 6)
    estPlifs = 0
    dnaPlifs = 0
elif mode == 'using_quality_scores':
    # With quality scores: 6x1 match matrix plus a 5 (est) x 6 (dna) plif grid.
    sizeMatchmatrix = (6, 1)
    estPlifs = 5
    dnaPlifs = 6
else:
    # Raised explicitly instead of `assert False` so that the check is not
    # stripped when Python runs with -O; exception type and message are the
    # same as before.
    raise AssertionError('Wrong operation mode specified')

# Total number of quality plifs: 0 in 'normal' mode, 30 with quality scores.
numQualPlifs = estPlifs * dnaPlifs
91
92 ###############################################################################
93 #
94 # GENERAL SETTINGS CONCERNING THE SOLVER
95 #
96 ###############################################################################
97
# Numeric solver settings.
# NOTE(review): `anzpath` is presumably the number of paths considered --
# confirm against the solver code.
iter_steps = 40
anzpath = 2

# Boolean switches, both disabled.
remove_duplicate_scores = False
print_matrix = False
102
# Validate the operation mode. Raised explicitly instead of `assert False` so
# that the check is not stripped when Python runs with -O; exception type and
# message are the same as before.
if mode not in ('normal', 'using_quality_scores'):
    raise AssertionError('Wrong operation mode specified')

# No fixed initial parameter vector is used in either mode. The pickled vector
# previously assigned in 'normal' mode is disabled:
#fixedParam = fixedParamQ
fixedParam = None
110
111 ###############################################################################
112 #
113 # DATA SETTINGS CONCERNING THE SPLITS AND FILE LOCATIONS
114 #
115 ###############################################################################
# Index ranges of the training and prediction splits.
training_begin, training_end = 0, 10000
prediction_begin, prediction_end = 10000, 40000

# Short alias used for every path built below.
joinp = os.path.join

# Root directories.
data_path = '/fml/ag-raetsch/share/projects/qpalma/solexa'
tmp_dir = '/fml/ag-raetsch/home/fabio/tmp/solexa_tmp'

# Sub-directories of data_path. A generator expression is used so that no
# loop variable is left behind in the module namespace.
original_path, annot_path, remapped_path = (
    joinp(data_path, subdir)
    for subdir in ('original_solexa_data',
                   'annotation_data',
                   'remapped_solexa_data')
)

# Despite the `_fn` suffix this value is a directory (note the trailing slash).
dna_flat_fn = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

# Template for the annotation files; the '%s' is filled in by the caller.
# NOTE(review): presumably with a chromosome identifier -- confirm.
gff_fn = joinp(annot_path, 'TAIR7_GFF3_genes_Chr%s.gff_v1')
134
135 ###############################################################################
136 #
137 # SANITY CHECKS
138 #
139 ###############################################################################
# Currently disabled range checks on the settings above:
#assert numQualPlifs >= 0
#assert numDonSuppPoints > 1
#assert numAccSuppPoints > 1
#assert numLengthSuppPoints > 1
#assert numQualSuppPoints > 1

# Fail at import time when the genome location is missing. Raised explicitly
# instead of via `assert` so that the check still runs under -O; exception
# type and message are the same as before.
if not os.path.exists(dna_flat_fn):
    raise AssertionError('DNA data does not exist!')
146
# Base alphabet: gap symbol followed by the nucleotides and 'n'.
alphabet = ['-', 'a', 'c', 'g', 't', 'n']

# The extended alphabet additionally contains the two bracket symbols; it is a
# separate list object, so changing one list does not affect the other.
extended_alphabet = alphabet + ['[', ']']