import sys
import random
import os
+import re
import pdb
import pydb
import io_pickle
alphabet = ['-','a','c','g','t','n']
# some dummy elements to be returned if the current example violates assertions
-nil_dna_frag = ('','','','')
-remapped_nil = ('','','')
+nil_dna_frag = ('','','','','')
+remapped_nil = ''
process_nil = ('','','','','','')
nil_splice_scores = ('','')
assert os.path.exists(filtered_reads)
assert os.path.exists(remapped_reads)
-
-assert not os.path.exists(dataset_file), 'The data_file already exists!'
+assert not os.path.exists(dataset_file), 'The dataset file already exists!'
joinp = os.path.join
RemappedReads = []
AlternativeSequences = []
- AlternativeAcceptors = []
- AlternativeDonors = []
# Iterate over all remapped reads in order to generate a
# training / prediction example for each read
original_est = currentFRead['seq']
- alternative_seq, alternative_acc, alternative_don = process_remapped_reads(reReads,currentFRead,dna_flat_files)
+ alternative_seq = process_remapped_reads(reReads,currentFRead,dna_flat_files)
-if seq == '':
+if alternative_seq == '':
continue
#FReads.append(currentFRead)
AlternativeSequences.append(alternative_seq)
- AlternativeAcceptors.append(alternative_acc)
- AlternativeDonors.append(alternative_don)
instance_counter += 1
dataset = [Sequences, Acceptors, Donors,\
Exons, Ests, OriginalEsts, Qualities,\
- AlternativeSequences,\
- AlternativeAcceptors,\
- AlternativeDonors]
+ AlternativeSequences]
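+# all lists above are parallel: entry i of each list belongs to example i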
#dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
#'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
"""
- currentReadSeq = fRead['seq']
- currentReadSeq = currentReadSeq.lower()
-
quality = fRead['prb']
#quality = fRead['cal_prb']
#quality = fRead['chastity']
return process_nil
# fetch the dna fragment that represents the part of the two exons
- dnaFragment,currentExons,up_cut,down_cut = getDNAFragment(currentGene,genomicSeq,fRead)
+ dnaFragment,currentExons,up_cut,down_cut, currentReadSeq = getDNAFragment(currentGene,genomicSeq,fRead)
if dnaFragment == '':
return process_nil
#print 'leaving getDNAFragment...'
# end of exons/read borders calculation
- return currentDNASeq,currentExons,up_cut,down_cut
+ return currentDNASeq, currentExons, up_cut, down_cut, currentReadSeq
def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset):
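+ # returns two lists (acceptor scores, donor scores) for the interval
+ # [intervalBegin,intervalEnd], positioned relative to seq_pos_offset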
strand = fRead['strand']
pos = fRead['p_start']
chr = fRead['chr']
- chrom = 'chr%d' % chr
- allDNASeq = []
- allAcc = []
- allDon = []
+ #allDNASeq = []
+ #allAcc = []
+ #allDon = []
+ #allLabels = []
+ alternativeSeq = []
for currentRRead in reReads:
rId = currentRRead['id']
# vmatch found a correct position
if pos + s1_pos == pos1:
- genomicSeq_start = pos
- genomicSeq_stop = fRead['p_stop']
+ genomicSeq_start = pos1
+ genomicSeq_stop = pos1 + fragment_size
+ currentLabel = True
else:
genomicSeq_start = pos1
genomicSeq_stop = pos1 + fragment_size
+ currentLabel = False
#return remapped_nil
- genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
- genomicSeq = genomicSeq.lower()
+ #currentDNASeq, currentAcc, currentDon =\
+ #get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files)
- # check the obtained dna sequence
- assert genomicSeq != '', 'load_genomic returned empty sequence!'
- for elem in genomicSeq:
- assert elem in alphabet, pdb.set_trace()
+ #allDNASeq.append(currentDNASeq)
+ #allAcc.append(currentAcc)
+ #allDon.append(currentDon)
+ #allLabels.append(currentLabel)
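+ # store only the coordinates of the alternative hit; the sequence and
+ # splice scores are fetched later on demand via get_seq_and_scores (below)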
+ alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop) )
- intervalBegin = genomicSeq_start-100
- intervalEnd = genomicSeq_stop+100
- currentDNASeq = genomicSeq
- seq_pos_offset = genomicSeq_start
+ #currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files)
- #pydb.debugger()
+ #return allDNASeq, allAcc, allDon, allLabels
+ return alternativeSeq
+
+
+def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+ """
+ This function expects an interval, chromosome and strand information and
+ then returns the genomic sequence of this interval together with the
+ associated splice site scores.
+ """
+
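+ # illustrative call (values are made up):
+ #   seq, acc, don = get_seq_and_scores(1,'+',1000,1500,dna_flat_files)
+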
+ chrom = 'chr%d' % chr
+ genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+ genomicSeq = genomicSeq.lower()
+
+ # check the obtained dna sequence
+ assert genomicSeq != '', 'load_genomic returned empty sequence!'
+ #for elem in genomicSeq:
+ # if not elem in alphabet:
+
+ no_base = re.compile('[^acgt]')
+ genomicSeq = no_base.sub('n',genomicSeq)
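+ # e.g. 'acxgt' -> 'acngt': every character outside acgt is replaced by 'n'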
- currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset)
+ intervalBegin = genomicSeq_start-100
+ intervalEnd = genomicSeq_stop+100
+ currentDNASeq = genomicSeq
+ seq_pos_offset = genomicSeq_start
- allDNASeq.append(currentDNASeq)
- allAcc.append(currentAcc)
- allDon.append(currentDon)
+ currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset)
- return allDNASeq, allAcc, allDon
+ return currentDNASeq, currentAcc, currentDon
def reverse_complement(seq):
-map = {'a':'t','c':'g','g':'c','t':'a'}
+map = {'a':'t','c':'g','g':'c','t':'a','n':'n'}
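+# e.g. reverse_complement('aacg') -> 'cgtt'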
# quality scores.
#
-# This file represents the conversion of the main matlab
-# training loop for Palma to python.
+# This file represents the conversion of the main Matlab
+# training loop for Palma to Python.
#
# Author: Fabio De Bona
#
import qpalma
from qpalma.SIQP_CPX import SIQPSolver
from qpalma.DataProc import *
-from qpalma.generateEvaluationData import *
from qpalma.computeSpliceWeights import *
from qpalma.set_param_palma import *
from qpalma.computeSpliceAlignWithQuality import *
from qpalma.penalty_lookup_new import *
from qpalma.compute_donacc import *
-from qpalma.export_param import *
+#from qpalma.export_param import *
from qpalma.TrainingParam import Param
from qpalma.Plif import Plf
from qpalma.tools.splicesites import getDonAccScores
from qpalma.Configuration import *
-from compile_dataset import compile_d
-
import palma.palma_utils as pu
from palma.output_formating import print_results
def __init__(self,run):
self.ARGS = Param()
-
self.run = run
- #data_filename = self.run['dataset_filename']
- #Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
-
- # Load the whole dataset
if self.run['mode'] == 'normal':
self.use_quality_scores = False
else:
assert(False)
- #self.Sequences = Sequences
- #self.Exons = Exons
- #self.Ests = Ests
- #self.OriginalEsts= OriginalEsts
- #self.Qualities = Qualities
- #self.SplitPos = SplitPos
- #self.Donors = Donors
- #self.Acceptors = Acceptors
+ full_working_path = os.path.join(run['experiment_path'],run['name'])
+ #assert not os.path.exists(full_working_path)
+ #os.mkdir(full_working_path)
+ assert os.path.exists(full_working_path)
- #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
- #print 'leaving constructor...'
+ # ATTENTION: Changing working directory
+ os.chdir(full_working_path)
+
+ cPickle.dump(run,open('run_object.pickle','w+'))
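+
+ # the run settings can be restored later via:
+ #   run = cPickle.load(open('run_object.pickle'))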
def plog(self,string):
self.logfh.write(string)
def train(self):
run = self.run
- full_working_path = os.path.join(run['experiment_path'],run['name'])
- assert not os.path.exists(full_working_path)
- os.mkdir(full_working_path)
- assert os.path.exists(full_working_path)
-
- # ATTENTION: Changing working directory
- os.chdir(full_working_path)
-
self.logfh = open('_qpalma_train.log','w+')
self.plog("Settings are:\n")
self.plog("%s\n"%str(run))
data_filename = self.run['dataset_filename']
- Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
+ Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
+ AlternativeSequences =\
+ paths_load_data(data_filename,'training',None,self.ARGS)
# Load the whole dataset
if self.run['mode'] == 'normal':
self.Ests = Ests
self.OriginalEsts= OriginalEsts
self.Qualities = Qualities
- self.SplitPos = SplitPos
self.Donors = Donors
self.Acceptors = Acceptors
# Initialize solver
self.plog('Initializing problem...\n')
solver = SIQPSolver(run['numFeatures'],numExamples,run['C'],self.logfh,run)
+
#solver.enforceMonotonicity(lengthSP,lengthSP+donSP)
#solver.enforceMonotonicity(lengthSP+donSP,lengthSP+donSP+accSP)
###############################################################################
- def evaluate(self,param_filename,dataset):
+ def evaluate(self,param_filename):
run = self.run
+ beg = run['prediction_begin']
+ end = run['prediction_end']
- data = dataset
- self.Sequences = data['Sequences']
- self.Acceptors = data['Acceptors']
- self.Donors = data['Donors']
- self.Exons = data['Exons']
- self.Ests = data['Ests']
- self.OriginalEsts= data['OriginalEsts']
- self.Qualities = data['Qualities']
- self.SplitPositions = data['SplitPositions']
+ data_filename = self.run['dataset_filename']
+ Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
+ AlternativeSequences=\
+ paths_load_data(data_filename,'training',None,self.ARGS)
+
+ self.Sequences = Sequences
+ self.Exons = Exons
+ self.Ests = Ests
+ self.OriginalEsts= OriginalEsts
+ self.Qualities = Qualities
+ self.Donors = Donors
+ self.Acceptors = Acceptors
- self.FReads = data['FReads']
+ self.AlternativeSequences = AlternativeSequences
- beg = run['prediction_begin']
- end = run['prediction_end']
+ #calc_info(self.Acceptors,self.Donors,self.Exons,self.Qualities)
+ #print 'leaving constructor...'
self.logfh = open('_qpalma_predict.log','w+')
# predict on training set
print '##### Prediction on the training set #####'
- #self.predict(param_filename,0,beg)
+ self.predict(param_filename,0,beg)
# predict on test set
print '##### Prediction on the test set #####'
#pdb.set_trace()
def predict(self,param_filename,beg,end):
+ """
+ Performing a prediction takes...
+
+ """
+
run = self.run
if self.run['mode'] == 'normal':
Qualities = self.Qualities[beg:end]
Acceptors = self.Acceptors[beg:end]
Donors = self.Donors[beg:end]
- SplitPos = self.SplitPositions[beg:end]
+ #SplitPos = self.SplitPositions[beg:end]
+ #FReads = self.FReads[beg:end]
- FReads = self.FReads[beg:end]
+ AlternativeSequences = self.AlternativeSequences[beg:end]
print 'STARTING PREDICTION'
param = cPickle.load(open(param_filename))
# Set the parameters such as limits penalties for the Plifs
- #pdb.set_trace()
[h,d,a,mmatrix,qualityPlifs] =\
set_param_palma(param,self.ARGS.train_with_intronlengthinformation,run)
currentPhi = zeros((self.run['numFeatures'],1))
totalQualityPenalties = zeros((totalQualSP,1))
+ # some data structures storing the predictions
exon1Begin = []
exon1End = []
exon2Begin = []
allPredictions = []
+ # beginning of the prediction loop
+ #
for exampleIdx in range(numExamples):
dna = Sequences[exampleIdx]
est = Ests[exampleIdx]
- fRead = FReads[exampleIdx]
- est = fRead['seq']
-
new_est = ''
e = 0
-while True:
+while e < len(est):
new_est += est[e]
e += 1
- #pdb.set_trace()
est = new_est
- #pdb.set_trace()
print exampleIdx
est = "".join(est)
est = est.lower()
est = est.replace('-','')
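+# e.g. 'ac-gt' -> 'acgt' (alignment gap symbols are removed)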
- #original_est = OriginalEsts[exampleIdx]
- #original_est = "".join(original_est)
- #original_est = original_est.lower()
- original_est = est
-
- currentSplitPos = SplitPos[exampleIdx]
+ original_est = OriginalEsts[exampleIdx]
+ original_est = "".join(original_est)
+ original_est = original_est.lower()
+ #currentSplitPos = SplitPos[exampleIdx]
#import re
#num_hits = len(re.findall('\[',original_est[:currentSplitPos]))
if not run['enable_quality_scores']:
quality = [40]*len(est)
- quality = fRead['prb']
+ #quality = fRead['prb']
exons = zeros((2,2))
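+# currentPhi layout: [donor | acceptor | length penalties | match matrix | quality plifs]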
currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
- ################## Calculate wrong alignment(s) ######################
+ ################## Calculate alignment(s) ######################
# Compute donor, acceptor with penalty_lookup_new
# returns two double lists
self.plog(line2+'\n')
self.plog(line3+'\n')
- e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,currentSplitPos,exampleIdx)
+ e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign,exampleIdx)
one_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,
'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
- #print 'total correct'
- #print correct_rest2error
-
- #import pylab
- #
- #y_values = []
- #for overlap in range(36):
- # result = 0
- # try:
- # cv = off_rest2error[overlap]
- # except:
- # cv = 0
-
- # result += cv
-
- # try:
- # cv = wrong_rest2error[overlap]
- # except:
- # cv = 0
-
- # result += cv
-
- # y_values.append(result/(1.0*numExamples))
-
- #pylab.bar(range(36),y_values,width=0.2,align='center')
- #pylab.xticks( numpy.arange(0,36,1) )
- #pylab.savefig('%s_error_vs_overlap.eps'%run['name'])
- #pylab.clf()
- #raw_input()
cPickle.dump(allPredictions,open('%s_allPredictions'%run['name'],'w+'))
- #print 'some positions off'
- #print off_rest2error
- #import pylab
- #pylab.bar(off_rest2error.keys(),off_rest2error.values(),width=0.2,align='center')
- ##pylab.show()
- #pylab.xticks( numpy.arange(0,max(off_rest2error.keys())+1,1) )
- #pylab.savefig('%s_offset.eps'%run['name'])
- #pylab.clf()
- ##raw_input()
-
- #print 'total wrong'
- #print wrong_rest2error
- #import pylab
- #pylab.bar(wrong_rest2error.keys(),wrong_rest2error.values(),width=0.2,align='center')
- #pylab.xticks( numpy.arange(0,max(wrong_rest2error.keys())+1,1) )
- #pylab.savefig('%s_wrong.eps'%run['name'])
- #pylab.clf()
-
print 'Prediction completed'
def evaluatePositions(self,eBegin,eEnd):
return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
- def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,spos,exampleIdx):
+ def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign,exampleIdx):
newExons = []
oldElem = -1
SpliceAlign = SpliceAlign.flatten().tolist()[0]
return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
-def fetch_genome_info(ginfo_filename):
- if not os.path.exists(ginfo_filename):
- cmd = ['']*4
- cmd[0] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/utils'
- cmd[1] = 'addpath /fml/ag-raetsch/home/fabio/svn/tools/genomes'
- cmd[2] = 'genome_info = init_genome(\'%s\')' % gen_file
- cmd[3] = 'save genome_info.mat genome_info'
- full_cmd = "matlab -nojvm -nodisplay -r \"%s; %s; %s; %s; exit\"" % (cmd[0],cmd[1],cmd[2],cmd[3])
-
- obj = subprocess.Popen(full_cmd,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
- out,err = obj.communicate()
- assert err == '', 'An error occured!\n%s'%err
-
- ginfo = scipy.io.loadmat('genome_info.mat')
- cPickle.dump(self.genome_info,open(ginfo_filename,'w+'))
- return ginfo['genome_info']
-
- else:
- return cPickle.load(open(ginfo_filename))
-
-
-
def calcStat(Acceptor,Donor,Exons):
maxAcc = -100
minAcc = 100
don_pos = [elem for elem in don_vec_pos if elem != -inf]
don_neg = [elem for elem in don_vec_neg if elem != -inf]
- #pdb.set_trace()
-
for idx in range(len(Sequences)):
acc = [elem for elem in Acceptors[idx] if elem != -inf]
maxAcc = max(max(acc),maxAcc)
###########################
if __name__ == '__main__':
- qpalma = QPalma()
-
mode = sys.argv[1]
+ run_obj_fn = sys.argv[2]
+
+ run_obj = cPickle.load(open(run_obj_fn))
+
+ qpalma = QPalma(run_obj)
+
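+# expected invocation (script name hypothetical):
+#   python qpalma.py train   <run_object.pickle>
+#   python qpalma.py predict <run_object.pickle> <param_file>
+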
- if len(sys.argv) == 2 and mode == 'train':
+ if len(sys.argv) == 3 and mode == 'train':
qpalma.train()
- elif len(sys.argv) == 3 and mode == 'predict':
- param_filename = sys.argv[2]
+ elif len(sys.argv) == 4 and mode == 'predict':
+ param_filename = sys.argv[3]
assert os.path.exists(param_filename)
qpalma.evaluate(param_filename)
else: