\item \emph{remapping}
\end{itemize}
+%
+%
+%
+\section{Installation}
+
+QPalma has the following requirements:
+\begin{itemize}
+\item NumPy
+\item To run QPalma on a cluster you need the pythongrid package, which can
+be found at the following URL:
+\item For training you need one of the following optimization toolkits:
+\begin{itemize}
+\item CPLEX
+\item CVXOPT
+\item MOSEK
+\end{itemize}
+\end{itemize}
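+
+As a quick sanity check (a minimal sketch; it only covers NumPy and the
+CVXOPT bindings), the corresponding modules should be importable:
+\begin{verbatim}
+import numpy
+import cvxopt
+\end{verbatim}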
+
+
%
%
%
\section{Pipeline}
-The full pipline constist of $n$ steps:
+The full pipeline consists of $n$ steps:
\begin{enumerate}
\item Find alignment seeds using a fast suffix array method (vmatch) for
max_len = 0;
min_len = 0;
cache = 0;
- enum ETransformType transform = T_LINEAR;
id = 0;
name = 0;
use_svm = 0;
int* max_score_positions = new int[nr_paths*2];
Pre_score** matrices = new Pre_score*[nr_paths];
- for (int z=0; z<nr_paths; z++) {
+ for (size_t z=0; z<nr_paths; z++) {
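+	// allocate one scoring matrix of size dna_len * est_len for each path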
matrices[z] = new Pre_score[dna_len * est_len];
}
qualityFeaturesAllPaths = new penalty_struct*[nr_paths];
- for (int z=0; z<nr_paths; z++) {
+ for (size_t z=0; z<nr_paths; z++) {
result_length = 0 ;
int* s_align = splice_align + (dna_len-1)*z; //pointer
if (use_quality_scores) {
penalty_struct currentPlif;
int ctr=0;
- for (int z=0; z<nr_paths; z++) {
+ for (size_t z=0; z<nr_paths; z++) {
for(int estChar=1;estChar<6;estChar++) {
for(int dnaChar=0;dnaChar<6;dnaChar++) {
void Alignment::getAlignmentArrays(int* dna_align, int* est_align) {
- int idx;
+ size_t idx;
for(idx=0; idx<result_len; idx++) {
dna_align[idx] = DNA_ARRAY[idx];
est_align[idx] = EST_ARRAY[idx];
if (use_quality_scores) {
penalty_struct currentPlif;
int ctr=0;
- for (int z=0; z<nr_paths; z++) {
+ for (size_t z=0; z<nr_paths; z++) {
for(int estChar=1;estChar<6;estChar++) {
for(int dnaChar=0;dnaChar<6;dnaChar++) {
import gridtools
+from utils import get_slices
+
jp = os.path.join
class ClusterTask(Thread):
"""
This class is the base class of all cluster tasks; it manages the creation,
submission and monitoring of a set of cluster jobs.
+
+    Every task creates a status file. All cluster jobs then report their
+    exit status to this status file, one "job id / status" pair per line,
+    e.g.
+
+    job id    status
+
+    1         success
+    2         failed
+    3         waiting
+
+    meaning that the job with id 1 terminated successfully (see the
+    status-file parsing sketch after this class).
"""
- def __init__(self):
+ def __init__(self,global_settings):
self.sleep_time = 0
# this list stores the cluster/local jobs objects
self.functionJobs = []
+ self.global_settings = global_settings
+
+ #self.allJobIds = {}
+
def createJobs(self):
pass
def submit(self):
-        for current_job in self.functionJobs:
-            (sid, jobids) = submit_jobs([functionJobs])
-            time.sleep(self.sleep_time)
+        # submit all jobs at once; keep the session and job ids for polling
+        self.sid, self.jobids = submit_jobs(self.functionJobs)
+
+ def restart(self,id):
+ pass
def checkIfTaskFinished(self):
- pass
-
+ """
+ currentJobStatus = {}
+ for line in open(self.statusFile):
+ id,status = line.split()
+ currentJobStatus[id] = status
+
+ for key in self.allJobIds.keys():
+ try:
+ self.currentJobStatus[key]
+ except:
+ self.currentJobStatus[key] = 'no response'
+ """
+
+ print 'checking whether finished'
+ while not get_status(self.sid, self.jobids):
+ time.sleep(7)
+ print 'collecting jobs'
+        retjobs = collect_jobs(self.sid, self.jobids, self.functionJobs)
+ print "ret fields AFTER execution on cluster"
+ for (i, job) in enumerate(retjobs):
+ print "Job #", i, "- ret: ", job.ret
+
+        print '--------------'
+        return True
+
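+
+# A minimal sketch of how the status file described in the ClusterTask
+# docstring could be parsed. The function names are assumptions; the status
+# strings follow the docstring above:
+def read_status_file(status_fn):
+    # map each job id to its last reported status
+    # (e.g. 'success', 'failed', 'waiting')
+    job_status = {}
+    for line in open(status_fn):
+        job_id, status = line.split()
+        job_status[int(job_id)] = status
+    return job_status
+
+def all_finished(job_status):
+    # a task counts as finished once no job is still waiting
+    return all(status != 'waiting' for status in job_status.values())
+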
class ApproximationTask(ClusterTask):
def g_heuristic(run_fname,data_fname,param_fname,result_fname):
- #print run_fname,data_fname,param_fname,result_fname
ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname)
ph1.filter()
return 'finished filtering set %s.' % data_fname
def createJobs(self):
- num_splits = 25
+ num_splits = self.global_settings['num_splits']
run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
- #data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'
-
- data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
-
+ param_fname = jp(run_dir,'param_526.pickle')
run_fname = jp(run_dir,'run_obj.pickle')
- #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
- #split_file(original_map_fname,data_dir,num_splits)
+ data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
- param_fname = jp(run_dir,'param_526.pickle')
+ original_map_fname = self.global_settings['read_ascii_data_fn']
+ split_file(original_map_fname,data_dir,num_splits)
functionJobs=[]
ClusterTask.__init__(self,global_settings)
- def get_slices(dataset_size,num_nodes):
- all_instances = []
-
- part = dataset_size / num_nodes
- begin = 0
- end = 0
- for idx in range(1,num_nodes+1):
-
- if idx == num_nodes:
- begin = end
- end = dataset_size
- else:
- begin = end
- end = begin+part
-
- params = (begin,end)
-
- all_instances.append(params)
-
- return all_instances
def makeJobs(run,dataset_fn,chunks,param):
"""
+ num_splits = self.global_settings['num_splits']
+
jp = os.path.join
run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
print 'Found %d keys for prediction.' % len(prediction_keys)
- num_splits = 25
- #num_splits = 1
slices = get_slices(len(prediction_keys),num_splits)
chunks = []
for idx,slice in enumerate(slices):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
import math
import numpy.matlib
import QPalmaDP
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import pdb
+import os.path
+
+from numpy.matlib import mat,zeros,ones,inf
+
+import palma.palma_utils as pu
+from palma.output_formating import print_results
+
+# some useful constants
+
+extended_alphabet = ['-','a','c','g','t','n','[',']']
+alphabet = ['-','a','c','g','t','n']
+
+alignment_alphabet = ['-','a','c','g','t','n','z']
+
+
+def print_prediction(example):
+ dna_array = example['dna_array']
+ read_array = example['read_array']
+
+ dna = map(lambda x: alignment_alphabet[x],dna_array)
+ read = map(lambda x: alignment_alphabet[x],read_array)
+
+ spliceAlign = example['spliceAlign']
+ estAlign = example['estAlign']
+
+ line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)
+
+ result = '%s\n%s\n%s' %(line1,line2,line3)
+ return result
+
+def calc_info(acc,don,exons,qualities):
+ min_score = 100
+ max_score = -100
+
+ for elem in acc:
+ for score in elem:
+ if score != -inf:
+ min_score = min(min_score,score)
+ max_score = max(max_score,score)
+
+ print 'Acceptors max/min: %f / %f' % (max_score,min_score)
+
+ min_score = 100
+ max_score = -100
+
+ for elem in don:
+ for score in elem:
+ if score != -inf:
+ min_score = min(min_score,score)
+ max_score = max(max_score,score)
+
+ print 'Donor max/min: %f / %f' % (max_score,min_score)
+
+ min_score = 10000
+ max_score = 0
+ mean_intron_len = 0
+
+ for elem in exons:
+ intron_len = int(elem[1,0] - elem[0,1])
+ mean_intron_len += intron_len
+ min_score = min(min_score,intron_len)
+ max_score = max(max_score,intron_len)
+
+ mean_intron_len /= 1.0*len(exons)
+ print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
+
+    min_score = 10000
+    max_score = 0
+    mean_quality = 0
+    num_values = 0
+
+    for qVec in qualities:
+        for elem in qVec:
+            min_score = min(min_score,elem)
+            max_score = max(max_score,elem)
+            mean_quality += elem
+            num_values += 1
+
+    mean_quality /= 1.0*num_values
+    print 'Quality values max/min: %d / %d mean %f' % (max_score,min_score,mean_quality)
+
+
+def calc_stat(Acceptors,Donors,Exons):
+ maxAcc = -100
+ minAcc = 100
+ maxDon = -100
+ minDon = 100
+
+ acc_vec_pos = []
+ acc_vec_neg = []
+ don_vec_pos = []
+ don_vec_neg = []
+
+ for jdx in range(len(Acceptors)):
+ currentExons = Exons[jdx]
+ currentAcceptor = Acceptors[jdx]
+ currentAcceptor = currentAcceptor[1:]
+ currentAcceptor.append(-inf)
+ currentDonor = Donors[jdx]
+
+ for idx,elem in enumerate(currentAcceptor):
+ if idx == (int(currentExons[1,0])-1): # acceptor site
+ acc_vec_pos.append(elem)
+ else:
+ acc_vec_neg.append(elem)
+
+ for idx,elem in enumerate(currentDonor):
+ if idx == (int(currentExons[0,1])): # donor site
+ don_vec_pos.append(elem)
+ else:
+ don_vec_neg.append(elem)
+
+ acc_pos = [elem for elem in acc_vec_pos if elem != -inf]
+ acc_neg = [elem for elem in acc_vec_neg if elem != -inf]
+ don_pos = [elem for elem in don_vec_pos if elem != -inf]
+ don_neg = [elem for elem in don_vec_neg if elem != -inf]
+
+    for idx in range(len(Acceptors)):
+ acc = [elem for elem in Acceptors[idx] if elem != -inf]
+ maxAcc = max(max(acc),maxAcc)
+ minAcc = min(min(acc),minAcc)
+
+ don = [elem for elem in Donors[idx] if elem != -inf]
+ maxDon = max(max(don),maxDon)
+ minDon = min(min(don),minDon)
+
+
+def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
+ (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
+ pu.get_splice_info(_newSpliceAlign,_newEstAlign)
+
+ t_strand = '+'
+ translation = '-ACGTN_' #how aligned est and dna sequence is displayed
+ #(gap_char, 5 nucleotides, intron_char)
+ comparison_char = '| ' #first: match_char, second: intron_char
+
+ (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
+ pu.comp_identity(dna_array, est_array, t_strand, qStart, tStart, translation, comparison_char)
+
+ first_identity = None
+ last_identity = None
+ for i in range(len(comparison)):
+ if comparison[i] == '|' and first_identity is None:
+ first_identity = i
+        if comparison[-(i+1)] == '|' and last_identity is None:
+            last_identity = len(comparison) - i - 1
+
+ try:
+ for idx in range(len(dna_array)):
+ dna_array[idx] = translation[int(dna_array[idx])]
+ est_array[idx] = translation[int(est_array[idx])]
+    except (ValueError, IndexError):
+        # the arrays may already contain translated characters
+        pass
+
+ line1 = "".join(dna_array)
+ line2 = comparison
+ line3 = "".join(est_array)
+
+ return line1,line2,line3
+
+
+def get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
+ return pu.get_splice_info(_newSpliceAlign,_newEstAlign)
+
+
+##########
+
+def split_file(filename,result_dir,num_parts):
+ """
+ Splits a file of n lines into num_parts parts
+ """
+
+ jp = os.path.join
+
+ all_intervals = []
+
+ print 'counting lines'
+ line_ctr = 0
+ for line in open(filename,'r'):
+ line_ctr += 1
+
+ print 'found %d lines' % line_ctr
+
+ part_size = line_ctr / num_parts
+ begin = 0
+ end = 0
+
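+    # e.g. 10 lines in 3 parts: part_size = 3, intervals (0,4), (4,8), (8,10)
+    # (the last interval is truncated to the total line count)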
+ for idx in range(1,num_parts+1):
+ begin = end
+
+ if idx == num_parts:
+ end = line_ctr
+ else:
+ end = begin+part_size+1
+
+ all_intervals.append((begin,end))
+
+ parts_fn = []
+ for pos,params in enumerate(all_intervals):
+ beg,end = params
+ out_fn = jp(result_dir,'map.part_%d'%pos)
+ print out_fn
+ parts_fn.append(out_fn)
+ out_fh = open(out_fn,'w+')
+
+ parts_fn.reverse()
+ all_intervals.reverse()
+
+ lineCtr = 0
+ beg = -1
+ end = -1
+ in_fh = open(filename,'r')
+ while True:
+ line = in_fh.readline()
+ if line == '':
+ break
+
+ if beg <= lineCtr < end:
+ out_fh.write(line)
+ lineCtr += 1
+        else:
+            (beg,end) = all_intervals.pop()
+            out_fn = parts_fn.pop()
+            out_fh.close()
+            out_fh = open(out_fn,'w+')
+            out_fh.write(line)
+            lineCtr += 1
+
+ out_fh.close()
+
+
+def get_slices(dataset_size,num_nodes):
+ """
+ Given the size of the dataset and the number of nodes which shall be used
+ this function returns the slice of the dataset for each node.
+ """
+
+ all_instances = []
+
+ part = dataset_size / num_nodes
+ begin = 0
+ end = 0
+ for idx in range(1,num_nodes+1):
+
+ if idx == num_nodes:
+ begin = end
+ end = dataset_size
+ else:
+ begin = end
+ end = begin+part
+
+ params = (begin,end)
+
+ all_instances.append(params)
+
+ return all_instances
+
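+# example: get_slices(10,3) returns [(0, 3), (3, 6), (6, 10)]; the last node
+# receives whatever remains of the dataset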
+
+if __name__ == '__main__':
+ split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
import cPickle
import sys
import pdb
#
# This file contains the main interface to the QPalma pipeline.
#
-#
-#
from optparse import OptionParser
return parser
+global_settings = {
+    'experiment_dir'     : '/fml/ag-raetsch/...',
+    'read_ascii_data_fn' : '/fml/ag-raetsch/...',
+    'num_splits'         : 50,
+    'global_log_fn'      : '~/qpalma.log'
+}
parser = create_option_parser()
(options, args) = parser.parse_args()
+ def training(self):
+ """
+ This function is responsible for the whole training process. It first
+ converts the data to the right format needed by QPalma for the training
+ algorithm.
+ """
- def run(self):
+ pre_task = TrainingPreprocessingTask(global_settings,run_specific_settings)
+ pre_task.createJobs()
+ pre_task.submit()
+        while not pre_task.checkIfTaskFinished():
+ sleep(20)
+
+
+ def prediction(self):
+ """
+    This function encapsulates all steps needed to perform a prediction.
+    Given the parameters obtained from training and the paths to a prediction
+    set, it generates several output files containing the spliced alignments.
+ """
# Before creating a candidate spliced read dataset we have to first filter
# the matches from the first seed finding run.
- grid_heuristic()
-
- # approx_task = ApproximationTask(...)
- # approx_task.createJobs()
- # approx_task.submit()
- # approx_task.checkIfTaskFinished()
-
+    approx_task = ApproximationTask(global_settings)
+ approx_task.createJobs()
+ approx_task.submit()
+ approx_task.checkIfTaskFinished()
+
# After filtering combine the filtered matches from the first run and the
# found matches from the second run to a full dataset
- createNewDataset
-
- # pre_task = PreprocessingTask(...)
- # pre_task.createJobs()
- # pre_task.submit()
+ pre_task = PreprocessingTask(...)
+ pre_task.createJobs()
+ pre_task.submit()
+    while not pre_task.checkIfTaskFinished():
+ sleep(20)
# Now that we have a dataset we can perform the accurate alignments for this
# data
- grid_predict()
-
- # align_task = AlignmentTask(...)
- # align_task.createJobs()
- # align_task.submit()
+ align_task = AlignmentTask(...)
+ align_task.createJobs()
+ align_task.submit()
+    while not align_task.checkIfTaskFinished():
+ sleep(20)
# The results of the above alignment step can be converted to a data format
- # needed for further postprocessing
-
- grid_alignment()
+ # needed for further postprocessing.
- # post_task = PostprocessingTask(...)
- # post_task.createJobs()
- # post_task.submit()
+ post_task = PostprocessingTask(...)
+ post_task.createJobs()
+ post_task.submit()
+    while not post_task.checkIfTaskFinished():
+ sleep(20)
+ print "Success!"
+
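+# The submit-and-poll sequence above repeats for every task. A small helper
+# could consolidate it -- this is a sketch; the helper name and poll interval
+# are assumptions, and it expects time.sleep to be imported as sleep:
+def run_task(task, poll_interval=20):
+    # create the cluster jobs, submit them and block until all have finished
+    task.createJobs()
+    task.submit()
+    while not task.checkIfTaskFinished():
+        sleep(poll_interval)
+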
if __name__ == '__main__':
system_obj = System()
ext_package = "qpalma",
#ext_modules = extmods,
package_dir = {"qpalma": "python"},
+ #ext_modules=Extension('QPalmaDP',[''])
packages = ["qpalma"])