import cPickle
import pdb
import Configuration as Conf
-from tools.PyGff import *
+from PyGff import *
import io_pickle
import scipy.io
import pdb
Exons = []
Ests = []
Qualities = []
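+ # split position of each read, parsed from the EST line and returned alongside the other per-example lists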
+ SplitPositions = []
for line in open(est_filename):
line = line.strip()
#Acceptors.append([-inf]*currentSize)
#Donors.append([-inf]*currentSize)
- exon_idx = int(exon_idx)
- currentExons = zeros((len(currentGene.exons),2))
+ exon_idx = int(exon_idx)-1
+ currentExons = zeros((2,2))
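+ # only the two exons flanking the read's split position are kept (row 0: upstream exon, row 1: downstream exon)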
#for idx in range(len(currentGene.exons)):
# currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
# currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
- currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
- currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
- currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
- currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
- cut_offset = 500
- up_cut = currentExons[0,0] - cut_offset
- down_cut = currentExons[1,1] + cut_offset
- if up_cut < 0:
- up_cut = 0
- if down_cut > len(currentSeq):
- down_cut = len(currentSeq)
- currentSeq = currentSeq[up_cut:down_cut]
+ try:
+ currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
+ currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
+
+ currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
+ currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
+ cut_offset = 500
+ up_cut = int(currentExons[0,0]) - cut_offset
+ down_cut = int(currentExons[1,1]) + cut_offset
+ if up_cut < 0:
+ up_cut = 0
+ if down_cut > len(currentSeq):
+ down_cut = len(currentSeq)
+ currentSeq = currentSeq[up_cut:down_cut]
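+ # currentSeq now spans just the exon pair plus up to cut_offset nt of flank on each side (clamped to the sequence boundaries)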
+
+ except:
+ pdb.set_trace()
Sequences.append(currentSeq)
print 'found %d examples' % len(Sequences)
- return Sequences, Acceptors, Donors, Exons, Ests, Qualities
+ return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
def paths_load_data_pickle(expt,genome_info,PAR):
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import cPickle
-import pdb
-import Configuration as Conf
-from tools.PyGff import *
-
-
-def paths_load_data_solexa(expt,genome_info,PAR):
- # expt can be 'training','validation' or 'test'
- assert expt in ['training','validation','test']
-
- dna_filename = Conf.dna_filename
- est_filename = Conf.est_filename
- tair7_seq_filename = Conf.tair7_seq_filename
-
- tair7_seq = cPickle.load(open(tair7_seq_filename))
- allGenes = cPickle.load(open(dna_filename))
-
- Sequences = []
- Acceptors = []
- Donors = []
- Exons = []
- Ests = []
- Qualities = []
-
- for line in open(est_filename):
- line = line.strip()
- chr,strand,seq, splitpos, length, prb, cal, chastity, gene_id, exon_idx = line.split()
- splitpos = int(splitpos)
- length = int(length)
- prb = [ord(elem)-50 for elem in prb]
- cal = [ord(elem)-64 for elem in cal]
- chastity = [ord(elem)+10 for elem in chastity]
-
- assert len(prb) == len(seq)
-
- currentGene = allGenes[gene_id]
- seq = seq.lower()
-
- try:
- currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
- except:
- continue
-
- #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
- #Acceptors.append([-inf]*currentSize)
- #Donors.append([-inf]*currentSize)
-
- exon_idx = int(exon_idx)
- currentExons = zeros((len(currentGene.exons),2))
- #for idx in range(len(currentGene.exons)):
- # currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
- # currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
-
- currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
- currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
-
- currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
- currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
-
- cut_offset = 500
- up_cut = currentExons[0,0] - cut_offset
- down_cut = currentExons[1,1] + cut_offset
-
- if up_cut < 0:
- up_cut = 0
-
- if down_cut > len(currentSeq):
- down_cut = len(currentSeq)
-
- currentSeq = currentSeq[up_cut:down_cut]
-
- Sequences.append(currentSeq)
-
- currentSize = len(Sequences[-1])
- Acceptors.append([0]*currentSize)
- Donors.append([0]*currentSize)
-
- Exons.append(currentExons)
- Ests.append(seq)
- Qualities.append(prb)
-
- SplitPositions.append(int(splitpos))
-
- if len(Sequences[-1]) == 2755:
- Sequences = Sequences[:-1]
- Acceptors = Acceptors[:-1]
- Donors = Donors[:-1]
- Exons = Exons[:-1]
- Ests = Ests[:-1]
- Qualities = Qualities[:-1]
-
- print 'found %d examples' % len(Sequences)
-
- return Sequences, Acceptors, Donors, Exons, Ests, Qualities
from numpy.linalg import norm
import QPalmaDP
-from SIQP_CPX import SIQPSolver
+import qpalma
+from qpalma.SIQP_CPX import SIQPSolver
from qpalma.DataProc import *
from generateEvaluationData import *
from numpy.linalg import norm
import QPalmaDP
-from SIQP_CPX import SIQPSolver
+import qpalma
+from qpalma.SIQP_CPX import SIQPSolver
from qpalma.DataProc import *
-from generateEvaluationData import *
-from computeSpliceWeights import *
-from set_param_palma import *
-from computeSpliceAlignWithQuality import *
-from penalty_lookup_new import *
-from compute_donacc import *
-from TrainingParam import Param
-from export_param import *
-
-import Configuration
-from Plif import Plf
-from Helpers import *
+from qpalma.generateEvaluationData import *
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.penalty_lookup_new import *
+from qpalma.compute_donacc import *
+from qpalma.TrainingParam import Param
+from qpalma.export_param import *
+import qpalma.Configuration
+from qpalma.Plif import Plf
+from qpalma.Helpers import *
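+# all QPalma helper modules are imported from the qpalma package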
def getQualityFeatureCounts(qualityPlifs):
weightQuality = qualityPlifs[0].penalties
def __init__(self):
self.ARGS = Param()
-
self.logfh = open('qpalma.log','w+')
- gen_file= '%s/genome.config' % self.ARGS.basedir
-
- ginfo_filename = 'genome_info.pickle'
- self.genome_info = fetch_genome_info(ginfo_filename)
- self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
+ #gen_file= '%s/genome.config' % self.ARGS.basedir
+ #ginfo_filename = 'genome_info.pickle'
+ #self.genome_info = fetch_genome_info(ginfo_filename)
+ #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
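+ # genome_info is unused here; paths_load_data_solexa below is called with None in its place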
#self.ARGS.train_with_splicesitescoreinformation = False
Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
use_quality_scores = False
elif Configuration.mode == 'using_quality_scores':
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
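+ # SplitPos holds the per-read split positions (seventh return value of paths_load_data_solexa)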
#Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
exon_idx++;
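+ // exon_idx was already incremented above, so exon_idx-1 is the index passed to combine_info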
if (ue != 0 && dov != 0)
- combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+ combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx-1);
if (uo != 0 && ds != 0)
- combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
+ combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx-1);
ue = uo = ds = dov = 0;
goto exon_label;