+ renamed main dir in order to create python module hierarchy
[qpalma.git] / qpalma / paths_load_data_solexa.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from numpy.matlib import mat,zeros,ones,inf
5 import cPickle
6 import pdb
7
8 def paths_load_data_solexa(expt,genome_info,PAR):
9 # expt can be 'training','validation' or 'test'
10 assert expt in ['training','validation','test']
11
12 dna_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/allGenes.pickle'
13 est_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/filtered_reads'
14 tair7_seq_filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/tair7_seq.pickle'
15
16 tair7_seq = cPickle.load(open(tair7_seq_filename))
17 allGenes = cPickle.load(open(dna_filename))
18
19 Sequences = []
20 Acceptors = []
21 Donors = []
22 Exons = []
23 Ests = []
24 Qualities = []
25
26 for line in open(est_filename):
27 line = line.strip()
28 chr,strand,seq, splitpos, length, prb, cal, chastity, gene_id = line.split()
29 splitpos = int(splitpos)
30 length = int(length)
31 prb = [ord(elem)-50 for elem in prb]
32 cal = [ord(elem)-64 for elem in cal]
33 chastity = [ord(elem)+10 for elem in chastity]
34
35 assert len(prb) == len(seq)
36
37 currentGene = allGenes[gene_id]
38 seq = seq.lower()
39
40 try:
41 currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
42 Sequences.append(currentSeq)
43 except:
44 continue
45
46 currentSize = len(Sequences[-1])
47 #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
48 #Acceptors.append([-inf]*currentSize)
49 #Donors.append([-inf]*currentSize)
50 Acceptors.append([0]*currentSize)
51 Donors.append([0]*currentSize)
52
53 currentExons = zeros((len(currentGene.exons),2))
54 for idx in range(len(currentGene.exons)):
55 currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
56 currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
57 Exons.append(currentExons)
58 Ests.append(seq)
59 Qualities.append(prb)
60
61 if len(Sequences[-1]) == 2755:
62 Sequences = Sequences[:-1]
63 Acceptors = Acceptors[:-1]
64 Donors = Donors[:-1]
65 Exons = Exons[:-1]
66 Ests = Ests[:-1]
67 Qualities = Qualities[:-1]
68
69 print 'found %d examples' % len(Sequences)
70
71 return Sequences, Acceptors, Donors, Exons, Ests, Qualities