cb9565d6b341befccfbfb3a11d70d8eaaf9d058e
[qpalma.git] / python / paths_load_data_solexa.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import cPickle
6 import pdb
7
8 def paths_load_data_solexa(expt,genome_info,PAR):
9 """
10
11 """
12
13 # expt can be 'training','validation' or 'test'
14 assert expt in ['training','validation','test']
15
16 dna_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/tools/data_tools/allGenes.pickle'
17 est_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/tools/data_tools/filtered_reads'
18 tair7_seq_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/tair7_seq.pickle'
19
20 tair7_seq = cPickle.load(open(tair7_seq_filename))
21 allGenes = cPickle.load(open(dna_filename))
22
23 Sequences = []
24 Acceptors = []
25 Donors = []
26 Exons = []
27 Ests = []
28 Qualities = []
29
30 for line in open(est_filename):
31 line = line.strip()
32 chr,strand,seq, splitpos, length, prb, cal, chastity, gene_id = line.split()
33 splitpos = int(splitpos)
34 length = int(length)
35 prb = [ord(elem)-50 for elem in prb]
36 cal = [ord(elem)-64 for elem in cal]
37 chastity = [ord(elem)+10 for elem in chastity]
38
39 assert len(prb) == len(seq)
40
41 currentGene = allGenes[gene_id]
42
43 #seq = seq.lower().replace('n','a')
44 seq = seq.lower()#.replace('n','a')
45
46 try:
47 currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
48 Sequences.append(currentSeq)
49 except:
50 continue
51
52 currentSize = len(Sequences[-1])
53 #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
54 #Acceptors.append([-inf]*currentSize)
55 #Donors.append([-inf]*currentSize)
56 Acceptors.append([0]*currentSize)
57 Donors.append([0]*currentSize)
58
59 currentExons = zeros((len(currentGene.exons),2))
60 for idx in range(len(currentGene.exons)):
61 currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
62 currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
63 Exons.append(currentExons)
64 Ests.append(seq)
65 Qualities.append(prb)
66
67 if len(Sequences[-1]) == 2755:
68 Sequences = Sequences[:-1]
69 Acceptors = Acceptors[:-1]
70 Donors = Donors[:-1]
71 Exons = Exons[:-1]
72 Ests = Ests[:-1]
73 Qualities = Qualities[:-1]
74
75 print 'found %d examples' % len(Sequences)
76
77 return Sequences, Acceptors, Donors, Exons, Ests, Qualities