+ renamed main dir in order to create python module hierarchy
[qpalma.git] / qpalma / paths_load_data.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import io_pickle
5 import scipy.io
6 import pdb
7
def paths_load_data(expt, genome_info, PAR):
    """
    Load the alignment data for one experiment phase from a MATLAB file.

    Python port of the MATLAB function
    ``[Sequences, Acceptors, Donors, Exons, Ests, Noises] =
    paths_load_data(expt,genome_info,PAR)``.

    Only the 'training' phase has been ported so far.

    Parameters:
        expt        -- one of 'training', 'validation' or 'test'
        genome_info -- object whose ``basedir`` attribute names the directory
                       containing the ``.mat`` files
        PAR         -- parameter object; ``PAR.microexon`` selects the
                       microexon data set and ``PAR.LOCAL_ALIGN`` selects the
                       local (True) or global (False) alignment variant

    Returns:
        The tuple (Sequences, Acceptors, Donors, Exons, Ests, Noises):
        dna sequences, acceptor scores, donor scores, exon boundaries,
        est sequences and the substitution matrix, as loaded by
        ``scipy.io.loadmat``.  Exons is lowered by one to convert MATLAB
        indices to Python indices.  Noises is an empty list unless the
        global microexon data set is loaded.

    Raises:
        AssertionError      -- expt is not one of the three phase names
        NotImplementedError -- expt is 'validation' or 'test'
    """

    assert expt in ['training', 'validation', 'test'], \
        "expt must be 'training', 'validation' or 'test', got %r" % (expt,)

    # Porting note: the MATLAB original reads the other two phases from
    #   microexon_<p>_data_cut_local_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat  (microexon, local)
    #   microexon_<p>_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat        (microexon, global)
    #   exons_<p>_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat                     (otherwise)
    # with <p> = 'val' / 'test', the probabilities taken from
    # PAR.insertion_prob, PAR.deletion_prob and PAR.mutation_prob, and the
    # variables Val/ValAcc/ValDon/ValExon/ValEsts respectively
    # Test/TestAcc/TestDon/TestExon/TestEsts (plus TestNoise for the global
    # microexon test set).
    #
    # Fail with a clear message rather than with an unbound-variable error
    # further down.
    if expt != 'training':
        raise NotImplementedError(
            "paths_load_data: loading %r data is not implemented yet" % (expt,))

    Noises = []

    if PAR.microexon:
        # NOTE(review): the local and global variants used to build
        # 'microexon_train_data_cut_local.mat' and
        # 'microexon_train_data_cut_ip=..._dp=..._mp=....mat' first, but both
        # names were immediately overwritten by the file below, which is the
        # one that has always been loaded -- confirm this is intended.
        train_data = '%s/microexon_train_data.mat' % genome_info.basedir
    else:
        train_data = '%s/exons_train_local.mat' % genome_info.basedir

    data = scipy.io.loadmat(train_data)

    if PAR.microexon and not PAR.LOCAL_ALIGN:
        # Only the global version reads the substitution matrix.
        Noises = data['TrainNoise']

    # Single-argument call form prints identically on Python 2 and Python 3.
    print('train_data is %s' % train_data)

    Sequences = data['Train']      # dna sequences
    Acceptors = data['TrainAcc']   # acceptor scores
    Donors = data['TrainDon']      # donor scores
    Exons = data['TrainExon']      # exon boundaries
    Ests = data['TrainEsts']       # est sequences

    # Lower all indices by one to convert matlab to python indices
    Exons -= 1

    return Sequences, Acceptors, Donors, Exons, Ests, Noises