\begin{enumerate}
\item unique read id
\item chromosome/contig id
-\item strand
+\item strand [D/P or +/-]
\item beginning of sequence fragment
\item end of sequence fragment
\item read sequence (in strand specific direction) with alignment information (see below)
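For illustration, a minimal sketch of how one such map-file line could be split into the fields listed above; the tab separator, the example line and the variable names are assumptions made here and are not part of the format specification:
# hypothetical map-file line; columns follow the field list above
line = 'read_0001\tchr1\tD\t1042\t1077\tACGTACGTACGT'
slist = line.strip().split('\t')
read_id   = slist[0]       # unique read id
chromo    = slist[1]       # chromosome/contig id
strand    = slist[2]       # strand [D/P or +/-]
seq_begin = int(slist[3])  # beginning of sequence fragment
seq_end   = int(slist[4])  # end of sequence fragment
read_seq  = slist[5]       # read sequence with alignment information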
import array
import cPickle
+import numpy
import os
import os.path
import pdb
+import sys
"""
- dataset = []
+ dataset = {}
half_window_size = settings['half_window_size']
+ # This tuple specifies an interval for valid Illumina Genome Analyzer quality values
+ if settings['platform'] == 'IGA':
+ quality_interval = illumina_ga_range
+
instance_counter = 0
accessWrapper = DataAccessWrapper(settings)
continue
# convert D/P strand info to +/-
- strand = ['-','+'][slist[2] == 'D']
+ strand = slist[2]
+
+ if strand in ['-','+']:
+ pass
+ elif strand in ['D','P']:
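+ # note: ['-','+'][True] picks '+', so 'D' maps to '+' and 'P' maps to '-'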
+ strand = ['-','+'][strand == 'D']
+ else:
+ print 'Error: strand information has to be either +/- or D/P'
+ sys.exit(1)
seqBeginning = int(slist[3])
seqEnd = int(slist[4])
readAlignment = slist[5]
- prb = processQuality(slist[6],prb_offset,quality_intervals,perform_checks)
+ prb = processQuality(slist[6],settings['prb_offset'],quality_interval,settings['perform_checks'])
exons = numpy.mat([0,0,0,0]).reshape((2,2))
exons[1,0] = int(slist[9])
exons[1,1] = int(slist[10])
- dna = seqInfo.get_seq_and_scores(chr,strand,up_cut,down_cut,only_seq=True)
- relative_exons = exons - up_cut
+ dna = seqInfo.get_seq_and_scores(chromo,strand,seqBeginning,seqEnd,only_seq=True)
+ relative_exons = exons - seqBeginning
assert checkExons(dna,relative_exons,readAlignment,id)
def addToDataset(map_file,dataset,settings):
assert os.path.exists(map_file), 'Error: Cannot find map file'
- perform_checks = settings['perform_checks']
-
- prb_offset = settings['prb_offset']
-
# This tuple specifies an interval for valid Illumina Genome Analyzer quality values
if settings['platform'] == 'IGA':
quality_interval = illumina_ga_range
dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
- prb = processQuality(slist[5],prb_offset,quality_interval,perform_checks)
+ prb = processQuality(slist[5],settings['prb_offset'],quality_interval,settings['perform_checks'])
currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
# now convert and check all integer parameters
for parameter in ['matchmatrixCols', 'matchmatrixRows', 'numAccSuppPoints', 'numConstraintsPerRound',\
'numDonSuppPoints', 'numLengthSuppPoints', 'numQualPlifs', 'numQualSuppPoints', 'anzpath', 'iter_steps',\
- 'max_intron_len', 'max_qual', 'min_intron_len', 'min_qual', 'totalQualSuppPoints','C','num_splits','prb_offset',\
+ 'max_intron_len', 'max_qual', 'min_intron_len', 'min_qual', 'C','num_splits','prb_offset',\
'half_window_size']:
try:
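# a plausible body for this try block (assumption: each listed entry is cast
# to int, with a clear error message if the config value is not numeric)
settings[parameter] = int(settings[parameter])
except ValueError:
print 'Error: parameter %s has to be an integer' % parameter
sys.exit(1)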
This class represents the cluster task for training QPalma.
"""
- def __init__(self):
- ClusterTask.__init__(self)
+ def __init__(self,settings):
+ ClusterTask.__init__(self,settings)
def CreateJobs(self):
def collectResults(self):
pass
+
def TrainingTaskStarter(settings,dataset_fn,training_keys,set_name):
accessWrapper = DataAccessWrapper(settings)
seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
from qpalma.gridtools import ApproximationTask,PreprocessingTask
from qpalma.gridtools import AlignmentTask,PostprocessingTask
+from qpalma.gridtools import TrainingTask
from qpalma.DatasetUtils import generatePredictionDataset,generateTrainingDataset
from qpalma.utils import logwrite
-
Errormsg = """Usage is:
python qpalma_pipeline.py train <config filename> <training data filename>
"""
+def printMessage(mes):
+ print '#'*80
+ print '\t\t\t%s...\n'%mes
+ print '#'*80
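+ # example: printMessage('Starting training') prints the message between two 80-character '#' rules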
+
+
class System:
"""
This class wraps the outer loop of the QPalma project.
"""
logwrite('Beginning of training.\n',self.settings)
- print '#'*80
- print '\t\t\tStarting approximation...\n'
- print '#'*80
+ printMessage('Starting dataset generation')
self.settings['training_data_fn'] = training_data_fn
# Collect the data and create a pickled training set
generateTrainingDataset(self.settings)
+ printMessage('Starting training')
+
# Now that we have a dataset we can perform training
train_task = TrainingTask(self.settings)
train_task.CreateJobs()
logwrite('Beginning of prediction.\n',self.settings)
- print '#'*80
- print '\t\t\tStarting approximation...\n'
- print '#'*80
+ printMessage('Starting approximation')
self.settings['prediction_param_fn'] = param_fn
self.settings['unspliced_reads_fn'] = unspliced_reads_fn
# After filtering combine the filtered matches from the first run and the
# found matches from the second run to a full dataset
- print '#'*80
- print '\t\t\tStarting dataset generation...\n'
- print '#'*80
+ printMessage('Starting dataset generation')
generatePredictionDataset(self.settings)
- print '#'*80
- print '\t\t\tStarting alignments...\n'
- print '#'*80
+ printMessage('Starting alignments')
# Now that we have a dataset we can perform accurate alignments
align_task = AlignmentTask(self.settings)
align_task.Submit()
align_task.CheckIfTaskFinished()
- print '#'*80
- print '\t\t\tPostprocessing...\n'
- print '#'*80
+ printMessage('Postprocessing')
# The results of the above alignment step can be converted to a data format
# needed for further postprocessing.
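# A plausible sketch of that conversion step (assumption: PostprocessingTask
# exposes the same Submit()/CheckIfTaskFinished() interface as AlignmentTask above):
post_task = PostprocessingTask(self.settings)
post_task.Submit()
post_task.CheckIfTaskFinished()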
if __name__ == '__main__':
- if len(sys.argv) != 4 or len(sys.argv) != 6:
+ if len(sys.argv) != 4 and len(sys.argv) != 6:
+ print 'Invalid number of arguments: expected 4 or 6, got %d' % len(sys.argv)
print Errormsg
sys.exit(1)
param_fn = sys.argv[3]
unspliced_reads_fn = sys.argv[4]
spliced_reads_fn = sys.argv[5]
- system_obj.prediction(unspliced_reads_fn, spliced_reads_fn)
+ system_obj.prediction(param_fn,unspliced_reads_fn, spliced_reads_fn)
elif mode == 'train':
training_data_fn = sys.argv[3]
system_obj.training(training_data_fn)
# These variables store the filename of the raw read data
#
-spliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map_2nd.vm
+#spliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map_2nd.vm
-unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
+#unspliced_reads_fn = /fml/ag-raetsch/home/fabio/tmp/sandbox/map.vm
#
# You can specify how many nodes you want to use via this variable
num_splits = 5
-#
-# The parameter you want to do prediction with:
-#
-
-prediction_param_fn = /fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+/param_526.pickle
-
#
# In order to align short reads, QPalma relies on additional information such as
# splice site score prediction.