+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import cPickle
-import pdb
-#import pydb
-import Configuration as Conf
-import tools
-from tools.PyGff import *
-import io_pickle
-#import scipy.io
-import sys
-
-from genome_utils import load_genomic
-from Genefinding import *
-
-def paths_load_data(filename):
-
- #data = io_pickle.load(filename)
- data = cPickle.load(open(filename))
-
- SeqInfo = data[0]
- Exons = data[1]
- OriginalEsts= data[2]
- Qualities = data[3]
- AlternativeSequences = data[4]
-
- return SeqInfo, Exons, OriginalEsts, Qualities,\
- AlternativeSequences
-
-
-def paths_load_data_solexa(expt,genome_info,PAR,test=False):
- """
-
- """
- # expt can be 'training','validation' or 'test'
- assert expt in ['training','validation','test']
-
- dna_filename = Conf.dna_filename
- est_filename = Conf.est_filename
-
- pdb.set_trace()
-
- allGenes = cPickle.load(open(dna_filename))
-
- Sequences = []
- Acceptors = []
- Donors = []
- Exons = []
- Ests = []
- Qualities = []
- SplitPositions = []
-
- rrp = RemappedReadParser('est_filename')
-
- # Iterate over all reads
- for line in open(est_filename):
- line = line.strip()
- #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
- #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
- id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
-
- splitpos = int(splitpos)
- length = int(length)
- prb = [ord(elem)-50 for elem in prb]
- cal = [ord(elem)-64 for elem in cal]
- chastity = [ord(elem)+10 for elem in chastity]
-
- assert len(prb) == len(seq)
-
- currentGene = allGenes[gene_id]
- seq = seq.lower()
-
- #try:
- # currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
- #except:
- # continue
-
- genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- chrom = 'chr1'
-
- # use matlab-style functions to access dna sequence
- dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
- currentSeq = dna
-
- if len(currentSeq) < (currentGene.stop - currentGene.start):
- continue
-
- # compare both sequences
- #assert dna == currentSeq, pdb.set_trace()
-
- p_start = int(p_start)
- exon_stop = int(exon_stop)
- exon_start = int(exon_start)
- p_stop = int(p_stop)
-
- assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
- assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
- assert p_stop - p_start >= 36
-
- currentExons = zeros((2,2))
-
- currentExons[0,0] = p_start
- currentExons[0,1] = exon_stop
-
- currentExons[1,0] = exon_start
- currentExons[1,1] = p_stop
-
- # make indices relative to start pos of the current gene
- currentExons -= currentGene.start
-
- # determine cutting positions for sequences
- up_cut = int(currentExons[0,0])
- down_cut = int(currentExons[1,1])
-
- # if we perform testing we cut a much wider region as we want to detect
- # how good we perform on a region
- if test:
- up_cut = up_cut-500
- if up_cut < 0:
- up_cut = 0
-
- down_cut = down_cut+500
-
- if down_cut > len(currentSeq):
- down_cut = len(currentSeq)
-
- # cut out part of the sequence
- currentSeq = currentSeq[up_cut:down_cut]
-
- # ensure that index is correct
- currentExons[0,1] += 1
-
- if test:
- currentExons -= up_cut
- else:
- currentExons -= currentExons[0,0]
-
- try:
- if not (currentSeq[int(currentExons[0,1])] == 'g' and\
- currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
- continue
-
- if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
- continue
-
- except:
- pdb.set_trace()
-
- Sequences.append(currentSeq)
-
- # now we want to use interval_query to get the predicted splice scores
- # trained on the TAIR sequence and annotation
-
-
- interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
- num_fields = 1
- num_intervals = 1
-
- # fetch acceptor scores
- fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
- gf = Genef()
- num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
- assert num_hits <= len(currentSeq)
-
- #print 'Acceptor hits: %d' % num_hits
-
- pos = createIntArrayFromList([0]*num_hits)
- indices = createIntArrayFromList([0]*num_hits)
- scores = createDoubleArrayFromList([0]*num_hits*num_fields)
- gf.getResults(pos,indices,scores)
-
- currentAcceptors = [-inf]*(len(currentSeq)+1)
-
- for i in range(num_hits):
- position = pos[i]
- position -= (currentGene.start+up_cut)
- assert 0 <= position < len(currentSeq)+1, 'position index wrong'
- currentAcceptors[position] = scores[i]
-
- currentAcceptors = currentAcceptors[1:] + [-inf]
- Acceptors.append(currentAcceptors)
-
- del gf
-
- # fetch donor scores
- fname = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
- gf = Genef()
- num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
- assert num_hits <= len(currentSeq)
-
- #print 'Donor hits: %d' % num_hits
- pos = createIntArrayFromList([0]*num_hits)
- indices = createIntArrayFromList([0]*num_hits)
- scores = createDoubleArrayFromList([0]*num_hits*num_fields)
- gf.getResults(pos,indices,scores)
-
- currentDonors = [-inf]*(len(currentSeq))
-
- try:
- for i in range(1,num_hits):
- position = pos[i]
- position -= (currentGene.start+up_cut)
- currentDonors[position-1] = scores[i]
- except:
- pdb.set_trace()
-
- currentDonors = [-inf] + currentDonors[:-1]
- Donors.append(currentDonors)
-
- Exons.append(currentExons)
- Ests.append(seq)
- Qualities.append(prb)
-
- SplitPositions.append(splitpos)
-
- #if len(Sequences[-1]) == 2755:
- # Sequences = Sequences[:-1]
- # Acceptors = Acceptors[:-1]
- # Donors = Donors[:-1]
- # Exons = Exons[:-1]
- # Ests = Ests[:-1]
- # Qualities = Qualities[:-1]
- # SplitPositions = SplitPositions[:-1]
-
- #print 'found %d examples' % len(Sequences)
-
- return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
-
-
-def paths_load_data_pickle(expt,genome_info,PAR):
- # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
- # Load the relevant file and return the alignment data
-
- # expt can be 'training','validation' or 'test'
-
- assert expt in ['training','validation','test']
-
- tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
-
- Noises = [];
-
- if expt == 'training':
- if PAR.microexon:
- if PAR.LOCAL_ALIGN: # local version
-
- train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
- data = cPickle.load(open(train_data))
-
- else: # global version
- pass
-
-
- else:
- train_data = '%s/exons_train_local.pickle' % genome_info.basedir
- data = cPickle.load(open(train_data))
-
- #print 'train_data is %s' % train_data
-
- Sequences = data['Train'] # dna sequences
- Acceptors = data['TrainAcc'] # acceptor scores
- Donors = data['TrainDon'] # donor scores
- Exons = data['TrainExon'] # exon boundaries
- Ests = data['TrainEsts'] # est sequences
-
- # Lower all indices by one to convert matlab
- # to python indices
-
- Exons -= 1
-
- return Sequences, Acceptors, Donors, Exons, Ests, Noises
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-###########################################################
-
-import bz2
-
-def writeStruct(fid,plif):
- fid.write('%s_limits=%s\n'%(plif.name,str(plif.limits)))
- fid.write('%s_penalties=%s\n'%(plif.name,str(plif.penalties)))
- fid.write('%s_bins=%d\n'%(plif.name,len(plif.limits)))
-
- if plif.name == 'intron':
- fid.write('%s_len_limits=%s\n'%(plif.name,str(plif.limits)))
- fid.write('%s_len_penalties=%s\n'%(plif.name,str(plif.penalties)))
- fid.write('%s_len_bins=%d\n'%(plif.name,len(plif.limits)))
- fid.write('%s_len_min=%d\n'%(plif.name,plif.min_len))
- fid.write('%s_len_max=%d\n'%(plif.name,plif.max_len))
- fid.write('%s_len_transform=%s\n'%(plif.name,plif.transform))
-
-def export_param(filename,h,d,a,mmatrix,qualityPlifs):
-
- # Exports a bz2 file with the trained PALMA. Assumes splice sites and intron length used.
- h.name = 'intron'
- d.name = 'donor'
- a.name = 'acceptor'
-
- fid = bz2.BZ2File(filename+'.bz2','w')
-
- fid.write('%palma definition file version: 1.0\n\n')
- fid.write('%penalties\n');
-
- writeStruct(fid, h);
- writeStruct(fid, a);
- writeStruct(fid, d);
-
- # substitution matrix
- mmatrix = mmatrix.reshape(6,6)
- fid.write('substitution_matrix=[')
- for row in range(6):
- if row == 5:
- fid.write('%f, %f, %f, %f, %f, %f]\n'%(mmatrix[row,0],mmatrix[row,1],mmatrix[row,2],mmatrix[row,3],mmatrix[row,4],mmatrix[row,5]))
- else:
- fid.write('%f, %f, %f, %f, %f, %f;\n'%(mmatrix[row,0],mmatrix[row,1],mmatrix[row,2],mmatrix[row,3],mmatrix[row,4],mmatrix[row,5]))
-
-
- for elem in qualityPlifs:
- writeStruct(fid, elem);
-
- fid.close()
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import random
-import pdb
-import os.path
-import cPickle
-
-class Dataset:
- pass
-
-def sample(population, k):
- """Chooses k random elements from a population sequence. """
- n = len(population)
- result = [None] * k
- for i in xrange(k):
- j = int(random.random() * n)
- result[i] = population[j]
- return result
-
-def generateData(numExamples):
- dna_len = 216
- est_len = 36
- random.seed(14)
-
- letters = ['a','c','g','t']
-
- Sequences = []
- Acceptors = []
- Donors = []
- Exons = []
- Ests = []
-
- for i in range(numExamples):
- dna = ''.join(sample(letters, dna_len))
- Sequences.append(dna)
-
- Acceptors.append([0.0]*dna_len)
- Donors.append([0.0]*dna_len)
-
- currentExon = zeros((2,2))
- currentExon[0,0] = 0
- currentExon[0,1] = 72
- currentExon[1,0] = 144
- currentExon[1,1] = 216
-
- Exons.append(currentExon)
-
- est = ''.join(sample(letters, est_len))
- Ests.append(est)
-
- preNr = 15
- middleNr = 6
- sufNr = 15
- Qualities = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
-
- return Sequences, Acceptors, Donors, Exons, Ests, Qualities
-
-def generateData2(numExamples):
- est_len = 36
- random.seed(14)
-
- letters = ['a','c','g','t']
-
- Sequences = []
- Acceptors = []
- Donors = []
- Exons = []
- Ests = []
-
- for i in range(numExamples):
- dna_len = random.randint(200,400)
- dna = ''.join(sample(letters, dna_len))
- #Sequences.append(dna)
- begin = random.randint(70,dna_len-70)
- end = begin+60
- dna = dna[0:begin] + 't'*18 + 'gt' + 'a'*20 + 'ag' + 't'*18 + dna[end:]
- dna_len = len(dna)
- Sequences.append(dna)
-
- currentDon = [-3.0]*dna_len
- currentAcc = [-3.0]*dna_len
-
- currentDon[begin+18+1] = 3.0
- currentDon[begin+18+2] = 3.0
-
- currentAcc[end-18-1] = 3.0
- currentAcc[end-18-2] = 3.0
-
- #pdb.set_trace()
-
- Donors.append(currentDon)
- Acceptors.append(currentAcc)
-
- currentExon = zeros((2,2))
- currentExon[0,0] = 0
- currentExon[0,1] = begin+18
- currentExon[1,0] = end-18
- currentExon[1,1] = dna_len-1
-
- Exons.append(currentExon)
-
- #est = ''.join(sample(letters, est_len))
- est = dna[begin-18:begin] + dna[end+1:end+19]
- est = 't'*est_len
- Ests.append(est)
-
- preNr = 15
- middleNr = 6
- sufNr = 15
- #Qualities = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
- Qualities = [[40]*est_len]*numExamples
-
- return Sequences, Acceptors, Donors, Exons, Ests, Qualities
-
-def loadArtificialData(numExamples):
- filename = 'artificial_dset_%d'%numExamples
- if not os.path.exists(filename):
- s,a,d,e,est,q = generateData2(numExamples)
- dset = Dataset()
- dset.Sequences = s
- dset.Acceptor = a
- dset.Donors = d
- dset.Exons = e
- dset.Ests = est
- dset.Qualities = q
- cPickle.dump(dset,open(filename,'w+'))
- else:
- dset = cPickle.load(open(filename))
- s = dset.Sequences
- a = dset.Acceptor
- d = dset.Donors
- e = dset.Exons
- est = dset.Ests
- q = dset.Qualities
-
- return s,a,d,e,est,q
-
-if __name__ == '__main__':
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(10)
- print Acceptors
- print Donors
- print Exons
- print Ests
- print Qualities