#include <sys/mman.h>
#include <sys/stat.h>
-
#include "read.h"
#define WITH_QUALITIES 0
initqparser();
}
-
-/*
-static int set_item_from_line(PyObject *result_dict, const char* current_line) {
-
- // increment the reference count for the result_dict object because we want
- // ot modify it
- Py_INCREF( result_dict );
-
- //printf("current line is %s\n",current_line);
- int entries_found = sscanf(current_line,line_format,&id,&chr,&strand,seq,&splitpos,&size,prb,cal_prb,chastity,geneId,&p_start,&exon_stop,&exon_start,&p_stop,&true_cut);
-
- if (entries_found != 15) {
- return entries_found;
- }
-
- //printf("after sscanf\n");
- int status;
-
- // create dictionary representing one line
- //PyObject* entry_dict = PyDict_New();
-
- // alternative way using a list instead of a dictionary
- PyObject* entry_list = PyList_New(15);
- Py_INCREF( entry_list );
-
- PyObject *id_py = PyInt_FromLong(id);
- PyObject *strand_py = PyString_FromString("--");
-
- if ( strand == 'D' )
- strand_py = PyString_FromString("+");
-
- if ( strand == 'P' )
- strand_py = PyString_FromString("-");
-
- //printf("before : %s\n",seq);
- Py_ssize_t idx;
- for(idx=0;idx<strlen(seq);idx++) {
- if ( 65 <= seq[idx] && seq[idx] < 85)
- seq[idx] = seq[idx]+32;
- }
- //printf("after : %s\n",seq);
-
- // add entries of that line
-
- //status = PyDict_SetItem(entry_dict, PyString_FromString("id"), id_py );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("chr"), PyInt_FromLong(chr) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("seq"), PyString_FromString(seq) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("strand"), strand_py );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("splitpos"), PyInt_FromLong(splitpos) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("read_size"), PyInt_FromLong(size) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("true_cut"), PyInt_FromLong(true_cut) );
-
- status = PyList_SetItem(entry_list, 0, id_py );
- status = PyList_SetItem(entry_list, 1, PyInt_FromLong(chr) );
- status = PyList_SetItem(entry_list, 2, PyString_FromString(seq) );
- status = PyList_SetItem(entry_list, 3, strand_py );
- status = PyList_SetItem(entry_list, 4, PyInt_FromLong(splitpos) );
- status = PyList_SetItem(entry_list, 5, PyInt_FromLong(size) );
- status = PyList_SetItem(entry_list, 6, PyInt_FromLong(true_cut) );
-
- PyObject* prb_list = PyList_New(size);
- PyObject* cal_prb_list = PyList_New(size);
- PyObject* chastity_list = PyList_New(size);
-
-#if WITH_QUALITIES
- for(idx=0;idx<size;idx++) {
- status = PyList_SetItem( prb_list, idx, PyInt_FromLong(prb[idx]-50) );
- status = PyList_SetItem( cal_prb_list, idx, PyInt_FromLong(cal_prb[idx]-64) );
- status = PyList_SetItem( chastity_list, idx, PyInt_FromLong(chastity[idx]+10) );
- }
-
- status = PyList_SetItem(entry_list, 7, prb_list );
- status = PyList_SetItem(entry_list, 8, cal_prb_list );
- status = PyList_SetItem(entry_list, 9, chastity_list );
-#else
- status = PyList_SetItem(entry_list, 7, PyString_FromString("") );
- status = PyList_SetItem(entry_list, 8, PyString_FromString("") );
- status = PyList_SetItem(entry_list, 9, PyString_FromString("") );
-#endif
-
-
- status = PyList_SetItem(entry_list, 10, PyString_FromString(geneId) );
- status = PyList_SetItem(entry_list, 11, PyInt_FromLong(p_start) );
- status = PyList_SetItem(entry_list, 12, PyInt_FromLong(exon_stop) );
- status = PyList_SetItem(entry_list, 13, PyInt_FromLong(exon_start) );
- status = PyList_SetItem(entry_list, 14, PyInt_FromLong(p_stop) );
-
- //status = PyDict_SetItem(entry_dict, PyString_FromString("prb"), prb_list );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("cal_prb"), cal_prb_list );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("chastity"), chastity_list );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("gene_id"), PyString_FromString(geneId) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("p_start"), PyInt_FromLong(p_start) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_stop"), PyInt_FromLong(exon_stop) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("exon_start"), PyInt_FromLong(exon_start) );
- //status = PyDict_SetItem(entry_dict, PyString_FromString("p_stop"), PyInt_FromLong(p_stop) );
-
- // now save the dictionary representing one line in the dictionary
- // representing the whole file
- //status = PyDict_SetItem(result_dict, id_py, entry_dict);
-
- status = PyDict_SetItem(result_dict, id_py, entry_list);
- if (status != 0) {
- PyErr_Warn(PyExc_Warning, "qparser.parse_reads: Failed to add item!");
- }
-
- Py_DECREF( entry_list );
- // decrement the reference count as we are finished with the local
- // modification of the object
- Py_DECREF( result_dict );
-
- return status;
-}
-*/
--- /dev/null
+#!/bin/bash
+
+#
+# The purpose of this script is to create a python package
+# for QPalma
+#
+
+name=qpalma-0.8
+tmp_dir=/tmp/$name
+result_fn=/tmp/$name.tar.gz
+
+# first we create a temporary directory to store the needed files:
+mkdir $tmp_dir
+
+# copy all needed files to the package directory
+cp setup.py $tmp_dir
+
+# create a gzipped tarball containing the QPalma package
+cd $tmp_dir
+cd ..
+tar -czvf $result_fn $name
+
+echo "Finished package-building result is under $result_fn"
--- /dev/null
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+import subprocess
+
+class PipelineStep:
+ """
+ The purpose of this class is to wrap a pipeline step which can be the
+ execution of a binary a small sequences of shell commands
+ """
+
+ def __init__(self,cL):
+ self.commandList = cL
+
+ def __call__(self):
+ success,pre_mes = self.checkPreConditions()
+ if not success:
+ return False,pre_mes
+
+ all_output = []
+ for currentCommand in self.commandList:
+ streams = subprocess.Popen(currentCommand,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+ out,err = streams.communicate()
+
+ if err != '':
+ return False,err
+
+ all_output.append(out)
+
+ success,post_mes = self.checkPostConditions()
+ if not success:
+ return False,post_mes
+
+ return True,all_output
+
+
+ def checkPreConditions(self):
+ return True,''
+
+
+ def checkPostConditions(self):
+ return True,''
+
+
+if __name__ == '__main__':
+ macro = PipelineStep(['date','w'])
+ status,mes = macro()
+ print status
+ print mes
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import random
-import os
-import re
-import pdb
-import random
-
-from numpy.matlib import inf
-from qpalma.sequence_utils import SeqSpliceInfo,get_flat_file_size,DataAccessWrapper
-
-
-class LookupTable:
- """
- Speed up genomic and splice site score queries by prefetching the sequences
- and the scores once for all chromosomes.
- """
-
- def __init__(self,genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt,chromo_list):
- accessWrapper = DataAccessWrapper(genomic_data_dir,acc_score_dir,don_score_dir,gen_fmt,sscore_fmt)
- self.seqInfo = SeqSpliceInfo(flat_files,chromo_list)
-
- self.strands = ['+','-']
-
- # take care that it also works say if we get a chromo_list [1,3,5]
- # those are mapped then to 0,1,2
- self.chromo_list = chromo_list
- self.chromo_map = []
- for elem in chromo_list:
- self.chromo_map.append(elem)
-
- self.genomicSequencesP = [0]*(len(self.chromo_map)+1)
- self.acceptorScoresP = [0]*(len(self.chromo_map)+1)
- self.donorScoresP = [0]*(len(self.chromo_map)+1)
-
- self.genomicSequencesN = [0]*(len(self.chromo_map)+1)
- self.acceptorScoresN = [0]*(len(self.chromo_map)+1)
- self.donorScoresN = [0]*(len(self.chromo_map)+1)
-
- for chrId in self.chromo_list:
- for strandId in self.strands:
- print (chrId,strandId)
- self.prefetch_seq_and_scores(chrId,strandId)
-
-
- def get_seq_and_scores(self,chromo,strand,start,stop,dna_flat_files):
- assert chromo in self.chromo_list
- assert strand in self.strands
-
- chromo_idx = self.chromo_map[chromo]
-
- if strand == '+':
- currentSequence = self.genomicSequencesP[chromo_idx][start:stop]
- currentDonorScores = self.donorScoresP[chromo_idx][start:stop]
- currentAcceptorScores = self.acceptorScoresP[chromo_idx][start:stop]
- elif strand == '-':
- currentSequence = self.genomicSequencesN[chromo_idx][start:stop]
- currentDonorScores = self.donorScoresN[chromo_idx][start:stop]
- currentAcceptorScores = self.acceptorScoresN[chromo_idx][start:stop]
- else:
- assert False
-
- return currentSequence, currentAcceptorScores, currentDonorScores
-
-
- def prefetch_seq_and_scores(self,chromo,strand):
- """
- This function expects an interval, chromosome and strand information and
- returns then the genomic sequence of this interval and the associated scores.
- """
-
- chromo_idx = self.chromo_map[chromo]
-
- genomicSeq_start = 0
-
- genomicSeq,currentAcc,currentDon = self.seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
- genomicSeq = genomicSeq.lower()
-
- if strand == '+':
- self.genomicSequencesP[chromo_idx] = genomicSeq
- self.acceptorScoresP[chromo_idx] = currentAcc
- self.donorScoresP[chromo_idx] = currentDon
- elif strand == '-':
- self.genomicSequencesN[chromo_idx] = genomicSeq
- self.acceptorScoresN[chromo_idx] = currentAcc
- self.donorScoresN[chromo_idx] = currentDon
- else:
- assert False
accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
- # implement uniqueness filtering using alignment score for reads with
- # several alignments
+ # Uniqueness filtering using alignment score for reads with several
+ # alignments
+ allUniquePredictions = {}
- #for pred_idx,current_prediction in enumerate(allPredictions):
-
+ for new_prediction in allPredictions:
+ id = new_prediction['id']
+ if allUniquePredictions.has_key(id):
+ current_prediction = allUniquePredictions[id]
+
+ current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+ new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
+
+ if current_a_score < new_score :
+ allUniquePredictions[id] = new_prediction
+
+ else:
+ allUniquePredictions[id] = new_prediction
written_lines = 0
- for pred_idx,current_prediction in enumerate(allPredictions):
+ for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
+
id = current_prediction['id']
#current_ground_truth = qparser.fetch_read(id)
from pythongrid import KybJob, Usage
from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
-from createAlignmentFileFromPrediction import *
+from createAlignmentFileFromPrediction import create_alignment_file
import grid_alignment
+
def g_alignment(chunk_fn,result_fn):
create_alignment_file(chunk_fn,result_fn)
def create_and_submit():
jp = os.path.join
- #run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
- #run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
- #run_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
- #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
- #data_dir = '/fml/ag-raetsch/home/fabio/tmp/sandbox/alignment'
-
- run_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
- data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
-
- #run_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
- #data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+ run_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
+ result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/alignment'
chunks_fn = []
for fn in os.listdir(run_dir):
chunks_fn.append(fn)
#chunks_fn = [\
- #'chunk_0.predictions.pickle',\
- #'chunk_4.predictions.pickle']
+ #'chunk_9.predictions.pickle',\
+ #'chunk_14.predictions.pickle',\
+ #'chunk_24.predictions.pickle']
print chunks_fn
for chunk_fn in chunks_fn:
chunk_name = chunk_fn[:chunk_fn.find('.')]
- result_fn = jp(data_dir,'%s.align_remap'%chunk_name)
+ result_fn = jp(result_dir,'%s.align_remap'%chunk_name)
chunk_fn = jp(run_dir,chunk_fn)
#pdb.set_trace()
(sid, jobids) = submit_jobs(functionJobs)
time.sleep(10)
- #break
if __name__ == '__main__':
create_and_submit()
data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
-
run_fname = jp(run_dir,'run_obj.pickle')
- #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm'
- #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/new_map.vm'
- original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
- split_file(original_map_fname,data_dir,num_splits)
+ #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
+ #split_file(original_map_fname,data_dir,num_splits)
param_fname = jp(run_dir,'param_526.pickle')
return all_instances
-def makeJobs(run,dataset_fn,chunks,param):
+def makeJobs(run,dataset_fn,chunks,param_fn):
"""
"""
jobs=[]
for c_name,current_chunk in chunks:
- current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
+ current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param_fn,c_name])
current_job.h_vmem = '20.0G'
#current_job.express = 'True'
run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
run['name'] = 'saved_run'
- param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
+ param_fn = jp(run_dir,'param_526.pickle')
- #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.pickle'
- #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_neg_strand_testcase.keys.pickle'
-
- #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
- #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
-
- #run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
- #dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.pickle'
- #prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/dataset_run_1.pickle.keys.pickle'
-
- run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
- dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.pickle'
- prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/dataset_run_2.pickle.keys.pickle'
+ run['result_dir'] = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/prediction'
+ dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.pickle'
+ prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/dataset_run_1.pickle.keys.pickle'
prediction_keys = cPickle.load(open(prediction_keys_fn))
c_name = 'chunk_%d' % idx
chunks.append((c_name,prediction_keys[slice[0]:slice[1]]))
- functionJobs = makeJobs(run,dataset_fn,chunks,param)
+ functionJobs = makeJobs(run,dataset_fn,chunks,param_fn)
sum = 0
for size in [len(elem) for name,elem in chunks]:
sum += size
- #assert sum == len(prediction_keys)
-
print 'Got %d job(s)' % len(functionJobs)
#print "output ret field in each job before sending it onto the cluster"
#for (i, job) in enumerate(functionJobs):
# print "Job with id: ", i, "- ret: ", job.ret
-
#print ""
#print "sending function jobs to cluster"
#print ""
- #pdb.set_trace()
-
(sid, jobids) = submit_jobs(functionJobs)
- #print 'checking whether finished'
- #while not get_status(sid, jobids):
- # time.sleep(7)
- #print 'collecting jobs'
- #retjobs = collect_jobs(sid, jobids, functionJobs)
- #print "ret fields AFTER execution on cluster"
- #for (i, job) in enumerate(retjobs):
- # print "Job #", i, "- ret: ", job.ret
-
- #print '--------------'
-
-def g_predict(run,dataset_fn,prediction_keys,param,set_name):
+def g_predict(run,dataset_fn,prediction_keys,param_fn,set_name):
"""
"""
- qp = QPalma()
- qp.predict(run,dataset_fn,prediction_keys,param,set_name)
-
+ qp = QPalma(False)
+ qp.init_prediction(run,dataset_fn,prediction_keys,param_fn,set_name)
return 'finished prediction of set %s.' % set_name
#
###########################################################
-import sys
+import array
import cPickle
-import pdb
import os.path
-import array
+import pdb
+import sys
-from qpalma.sequence_utils import get_seq_and_scores,reverse_complement,unbracket_seq
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,unbracket_seq
import numpy
from numpy.matlib import mat,zeros,ones,inf
the alignment.
"""
- def __init__(self):
+ def __init__(self,dmode=False):
self.ARGS = Param()
+ self.qpalma_debug_mode = dmode
def plog(self,string):
#
###############################################################################
- def predict(self,run,dataset_fn,prediction_keys,param,set_name):
+ def init_prediction(self,run,dataset_fn,prediction_keys,param_fn,set_name):
"""
Performing a prediction takes...
"""
self.run = run
+ self.set_name = set_name
#full_working_path = os.path.join(run['alignment_dir'],run['name'])
full_working_path = run['result_dir']
self.logfh = open('_qpalma_predict_%s.log'%set_name,'w+')
- if self.run['mode'] == 'normal':
- self.use_quality_scores = False
-
- elif self.run['mode'] == 'using_quality_scores':
- self.use_quality_scores = True
- else:
- assert(False)
# number of prediction instances
self.plog('Number of prediction examples: %d\n'% len(prediction_keys))
# we do not need the full dataset anymore
del dataset
+
+ # Load parameter vector to predict with
+ param = cPickle.load(open(param_fn))
+
+ self.predict(run,prediction_set,param)
+
+
+ def predict(self,run,prediction_set,param):
+ """
+ This method...
+ """
+
+ self.run = run
+
+ if self.run['mode'] == 'normal':
+ self.use_quality_scores = False
+
+ elif self.run['mode'] == 'using_quality_scores':
+ self.use_quality_scores = True
+ else:
+ assert(False)
# Set the parameters such as limits/penalties for the Plifs
[h,d,a,mmatrix,qualityPlifs] =\
#############################################################################################
# Prediction
#############################################################################################
-
- self.plog('Starting prediction...\n')
+
+ if not self.qpalma_debug_mode:
+ self.plog('Starting prediction...\n')
donSP = self.run['numDonSuppPoints']
accSP = self.run['numAccSuppPoints']
# we take the first quality vector of the tuple of quality vectors
quality_index = 0
+ # fetch the data needed
+ g_dir = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+ don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+ g_fmt = 'chr%d.dna.flat'
+ s_fmt = 'contig_%d%s'
+
+ num_chromo = 6
+
+ accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+ seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
# beginning of the prediction loop
for example_key in prediction_set.keys():
print 'Current example %d' % example_key
for example in prediction_set[example_key]:
- currentSeqInfo,original_read,currentQualities = example
+ currentSeqInfo,read,currentQualities = example
id,chromo,strand,genomicSeq_start,genomicSeq_stop =\
currentSeqInfo
- # remove brackets from the original read annotation
- read = unbracket_seq(original_read)
-
- if strand == '-':
- read = reverse_complement(read)
-
- self.plog('Loading example id: %d...\n'% int(id))
+ if not self.qpalma_debug_mode:
+ self.plog('Loading example id: %d...\n'% int(id))
if run['mode'] == 'normal':
quality = [40]*len(read)
quality = [40]*len(read)
try:
- currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
+ currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
except:
problem_ctr += 1
continue
current_prediction = self.calc_alignment(currentDNASeq, read,\
quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
- current_prediction['id'] = id
- #current_prediction['start_pos'] = up_cut
- current_prediction['start_pos'] = genomicSeq_start
- current_prediction['chr'] = chromo
- current_prediction['strand'] = strand
+ current_prediction['id'] = id
+ current_prediction['chr'] = chromo
+ current_prediction['strand'] = strand
+ current_prediction['start_pos'] = genomicSeq_start
allPredictions.append(current_prediction)
+ if not self.qpalma_debug_mode:
+ self.finalizePrediction(allPredictions,problem_ctr)
+ else:
+ return allPredictions
+
+
+ def finalizePrediction(self,allPredictions,problem_ctr):
# end of the prediction loop we save all predictions in a pickle file and exit
- cPickle.dump(allPredictions,open('%s.predictions.pickle'%(set_name),'w+'))
+ cPickle.dump(allPredictions,open('%s.predictions.pickle'%(self.set_name),'w+'))
print 'Prediction completed'
self.plog('Prediction completed\n')
mes = 'Problem ctr %d' % problem_ctr
# old code removed
newSpliceAlign = newSpliceAlign.reshape(1,dna_len)
newWeightMatch = newWeightMatch.reshape(1,mm_len)
- true_map = [0]*2
+ true_map = [0]*2
true_map[0] = 1
- pathNr = 0
+ pathNr = 0
_newSpliceAlign = array.array('B',newSpliceAlign.flatten().tolist()[0])
_newEstAlign = array.array('B',newEstAlign.flatten().tolist()[0])
alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, read_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
- dna_array = array.array('B',dna_array)
- read_array = array.array('B',read_array)
-
- #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
- #self.plog(line1+'\n')
- #self.plog(line2+'\n')
- #self.plog(line3+'\n')
+ dna_array = array.array('B',dna_array)
+ read_array = array.array('B',read_array)
newExons = self.calculatePredictedExons(newSpliceAlign)
current_prediction = {'predExons':newExons, 'dna':dna, 'read':read, 'DPScores':newDPScores,\
- 'alignment':alignment,\
- 'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
+ 'alignment':alignment,'spliceAlign':_newSpliceAlign,'estAlign':_newEstAlign,\
'dna_array':dna_array, 'read_array':read_array }
return current_prediction
newExons.append(pos)
return newExons
-
-
-###########################
-# A simple command line
-# interface
-###########################
-
-if __name__ == '__main__':
- assert len(sys.argv) == 4
-
- run_fn = sys.argv[1]
- dataset_fn = sys.argv[2]
- param_fn = sys.argv[3]
-
- run_obj = cPickle.load(open(run_fn))
- dataset_obj = cPickle.load(open(dataset_fn))
-
- qpalma = QPalma()
-
- if param_fn == 'train':
- qpalma.train(run_obj,dataset_obj)
- else:
- param_obj = cPickle.load(open(param_fn))
- qpalma.predict(run_obj,dataset_obj,param_obj)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+
#
# This file contains the main interface to the QPalma pipeline.
#
--- /dev/null
+from distutils.core import setup, Extension
+
+
+setup (name = 'QPalma'
+ description = 'A package for optimal alignment of short reads',
+ version = '1.0',
+ long_description = '''
+ A
+ ''',
+ author = 'Fabio De Bona',
+ author_email = 'fabio@tuebingen.mpg.de',
+ url = 'http://',
+ license = 'GNU GPL version 3',
+ ext_package = "qpalma",
+ #ext_modules = extmods,
+ package_dir = {"qpalma": "python"},
+ packages = ["qpalma"])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import pdb
-import unittest
+import cPickle
+import math
import numpy
import os.path
-import cPickle
+import pdb
+import unittest
from qpalma_main import QPalma
-from Utils import pprint_alignment
-from qpalma.sequence_utils import alphabet
+from Utils import print_prediction
+
+from createAlignmentFileFromPrediction import alignment_reconstruct
jp = os.path.join
This class...
"""
+ def _setUp(self):
+ data_fn = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset/prediction_set.pickle'
+
+ self.prediction_set = cPickle.load(open(data_fn))
+
+
+
def setUp(self):
+ print
self.prediction_set = {}
- read = 'catctcacagtcttcttcttcttctcgattgcagtagc'
- currentQualities = [[40]*len(read)]
+ # xxxxxxxxxxx_______________________________________________________________________xxxxxxxxxxxxxxxxxxxxxxxxxxx
+ # 'tattttggaaggtatttcatctctccgacattgctctcaacactgtcccctccaatgcctagtccttttatttttttcttagttccaattcccttaaatacatctcacagtcttcttcttcttctcgattgcagtagc'
+
+ read = 'tattttggaagttccaattcccttaatacatctcacag'
+ currentQualities = [[30]*len(read)]
id = 1
chromo = 3
tsize = 23470805
- genomicSeq_start = tsize - 2048
- genomicSeq_stop = tsize - 1990
+ genomicSeq_start = tsize - (2038+10)
+ genomicSeq_stop = tsize - 1900 + 10
+ print 'Position: ',
+ print genomicSeq_start,genomicSeq_stop
currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
example = (currentSeqInfo,read,currentQualities)
-
self.prediction_set[id] = [example]
+ # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxx
+ # 'gctactgcaatcgagaagaagaagaagactgtgagatgtatttaagggaattggaactaagaaaaaaataaaaggactaggcattggaggggacagtgttgagagcaatgtcggagagatgaaataccttccaaaata'
+
+ read = 'gctactgcaatcgagaagaagaagaagactatgaaata'
+ currentQualities = [[40]*30+[22]*8]
+
+ id = 2
+ chromo = 3
+ strand = '+'
+
+ tsize = 23470805
+
+ genomicSeq_start = tsize - (2038+10)
+ genomicSeq_stop = tsize - 1900 + 10
+ print 'Position: ',
+ print genomicSeq_start,genomicSeq_stop
+
+ currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+ example = (currentSeqInfo,read,currentQualities)
+ self.prediction_set[id] = [example]
def testAlignments(self):
qp = QPalma(True)
#qp.init_prediction(run,set_name)
allPredictions = qp.predict(run,self.prediction_set,param)
- for elem in allPredictions:
- dna_array = elem['dna_array']
- read_array = elem['read_array']
-
- dna = map(lambda x: alphabet[x],dna_array)
- read = map(lambda x: alphabet[x],read_array)
-
- spliceAlign = elem['spliceAlign']
- estAlign = elem['estAlign']
-
- line1,line2,line3 = pprint_alignment(spliceAlign, estAlign, dna, read)
- print line1
- print line2
- print line3
+ for current_prediction in allPredictions:
+ align_str = print_prediction(current_prediction)
+ print align_str
+
+ id = current_prediction['id']
+ seq = current_prediction['read']
+ dna = current_prediction['dna']
+ chromo = current_prediction['chr']
+ strand = current_prediction['strand']
+ start_pos = current_prediction['start_pos']
+ predExons = current_prediction['predExons']
+
+ numExons = int(math.ceil(len(predExons) / 2))
+
+ print alignment_reconstruct(current_prediction,numExons)
+ #print id,start_pos,predExons
if __name__ == '__main__':
// The purpose of this program is to read a gff and a
// solexa reads file and create a data set used by QPalma.
//
-//
-//
// Notes:
//
// - Both the read indices and the gff gene indices are one-based
//
-//
////////////////////////////////////////////////////////////////////////////////
#include <sys/mman.h>
//#define _FDEBUG_ 0
//#define DEBUG_READ 38603
-
-//
/*
* TODO:
* - Check strand -> done simple (only if equal)
}
}
+ exit(0);
free(up_used_flag);
free(down_used_flag);
}
int u_off = p_start - up_read_start;
int d_off = exon_start - down_read_start;
- FA(u_off >= 0 && d_off >= 0);
+ FA( u_off >= 0 && d_off >= 0 );
FA( exon_stop - p_start + p_stop - exon_start + 2 == read_size);
FA( u_size + d_size == read_size );
if ( current_strand == 'P' ) {
printf("flipping read sequences...\n");
printf("%s %s\n",up_read->seq,down_read->seq);
-
- char *tmp = malloc(sizeof(char)*strlen(up_read->seq+1));
+ char *tmp = malloc(sizeof(char)*(strlen(up_read->seq)+1));
strncpy(tmp,up_read->seq,strlen(up_read->seq));
tmp[strlen(up_read->seq)]='\0';
- realloc(up_read->seq,sizeof(char)*strlen(down_read->seq+1));
+
+ realloc(up_read->seq,sizeof(char)*(strlen(down_read->seq)+1));
strncpy(up_read->seq,down_read->seq,strlen(down_read->seq));
up_read->seq[strlen(down_read->seq)] = '\0';
- realloc(down_read->seq,sizeof(char)*strlen(tmp));
+
+ if (up_read->seq == NULL)
+ perror("realloc\n");
+
+ realloc(down_read->seq,sizeof(char)*strlen(tmp)+1);
strncpy(down_read->seq,tmp,strlen(tmp));
down_read->seq[strlen(tmp)] = '\0';
-
+
+ if (down_read->seq == NULL)
+ perror("realloc\n");
+
free(tmp);
-
printf("flipping done...\n");
printf("%s %s\n",up_read->seq,down_read->seq);
}
-
+
printf("start joining...\n");
Tuple jinfo = join_seq(new_seq,up_read->seq,u_off,u_size,down_read->seq,d_off,d_size,down_range);
printf("end of joining...\n");
+ printf("new_seq is %s\n",new_seq);
+
+
int cut_pos = jinfo.first;
int additional_pos = jinfo.second;
printf("jinfo contains %d/%d\n",jinfo.first,jinfo.second);
char* new_prb = malloc(sizeof(char)*buf_size);
char* new_cal_prb = malloc(sizeof(char)*buf_size);
char* new_chastity = malloc(sizeof(char)*buf_size);
+
if (new_prb == NULL || new_cal_prb == NULL || new_chastity == NULL)
perror("malloc\n");
-
- if( jinfo.first == -1 ) {
- retval = 0;
- goto free;
- }
-
- if( additional_pos > (down_range - d_size) ) {
+
+ if( jinfo.first == -1 || (additional_pos > (down_range - d_size)) ) {
retval = 0;
goto free;
}
printf("joining qualities...\n");
strncpy(new_prb, up_read->prb+u_off, u_size);
strncpy(new_prb+u_size, down_read->prb+d_off, d_size+additional_pos);
- new_prb[buf_size] = '\0';
+ new_prb[buf_size-1] = '\0';
strncpy(new_cal_prb, up_read->cal_prb+u_off, u_size);
strncpy(new_cal_prb+u_size, down_read->cal_prb+d_off, d_size+additional_pos);
- new_cal_prb[buf_size] = '\0';
+ new_cal_prb[buf_size-1] = '\0';
strncpy(new_chastity, up_read->chastity+u_off, u_size);
strncpy(new_chastity+u_size, down_read->chastity+d_off, d_size+additional_pos);
- new_chastity[buf_size] = '\0';
+ new_chastity[buf_size-1] = '\0';
printf("end of joining qualities...\n");
//printf("old reads: %s %s (%d %d %d/%d)\n",up_read->seq,down_read->seq,up_read->pos,down_read->pos,u_off,d_off);
//printf("new read: %s %d %d\n",new_seq,cut_pos,u_size);
-
//if ( current_strand == 'P' ) {
// int alpha = read_size - u_off - 1;
// int beta = alpha - u_size ;
// p_start = up_read->pos + beta + 1;
// exon_stop = up_read->pos + alpha;
-
// alpha = read_size - d_off - 1;
// beta = alpha - (read_size - u_size);
// exon_start = down_read->pos + beta + 1;
if (idx == len || temp_idx == -1)
break;
}
+ free(temp);
}
*
*/
-
int main(int argc, char* argv[]) {
if(argc != 8) {
if (! (old_gene_stop < currentGene->start ) ) {
printf("Gene %s is overlapping\n",currentGene->id);
old_gene_stop = currentGene->stop;
+ free_gene(allGenes[g_idx]);
+ free(allGenes[g_idx]);
allGenes[g_idx] = 0;
nulled_genes++;
continue;
#SRCS= parser.c\
#OBJS = $(SRCS:%.cpp=%.o)
-all: $(OBJS)
+all: compare.c
+ @ echo "Compiling..."
@ $(CC) $(CFLAGS) -o compare compare.c
#@ $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
strncpy(pred_fn,argv[2],strlen(argv[2]));
strncpy(result_fn,argv[3],strlen(argv[3]));
- FILE *result_fs = fopen(result_fn,"r");
+ FILE *result_fs = fopen(result_fn,"w+");
if(result_fs == NULL) {
printf("Error: Could not open file: %s",result_fn);
exit(EXIT_FAILURE);
import qpalma.tools
from qpalma.parsers import *
-from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
import QPalmaConfiguration as Conf
self.read_size = Conf.read_size
+ self.half_window_size = 1500
+
def saveAs(self,dataset_file):
dataset_fn = '%s.pickle'% dataset_file
dataset_keys_fn = '%s.keys.pickle'%dataset_file
- print dataset_fn
- print dataset_keys_fn
+ print dataset_fn, dataset_keys_fn
assert not os.path.exists(dataset_fn), 'The data_file already exists!'
assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
- def parse_map_file(self,dataset,map_file):
- strand_map = ['-','+']
+ def parse_map_file(self,dataset,map_file,first_round):
instance_counter = 0
g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+ not_consistent_ctr = 0
+
for line in open(map_file):
if instance_counter > 0 and instance_counter % 5000 == 0:
print 'processed %d examples' % instance_counter
- #if instance_counter == 10000:
+
+ if instance_counter == 10000:
+ print 'Not consistent ctr: %d' % not_consistent_ctr
# break
slist = line.split()
chromo = int(slist[1])
pos = int(slist[2])
- # convert D/P strand info to +/-
- strand = slist[3]
- strand = strand_map[strand == 'D']
-
- read_seq = slist[4]
-
- # QPalma uses lowercase characters
- read_seq = read_seq.lower()
- read_seq = unbracket_seq(read_seq)
-
- # fetch the prb quality vector
- _prb = slist[5]
-
# for the time being we only consider chromosomes 1-5
if not chromo in range(1,6):
continue
- if strand == '-':
- pos = seqInfo.chromo_sizes[chromo]-pos-self.read_size
+ # convert D/P strand info to +/-
+ if slist[3] == 'D':
+ strand = '+'
+ else:
+ strand = '-'
+
+ # QPalma uses lowercase characters
+ bracket_seq = slist[4].lower()
+ read_seq = unbracket_seq(bracket_seq)
+ reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
+
+ if strand == '-' and first_round:
read_seq = reverse_complement(read_seq)
+ #reconstructed_dna_seq = reverse_complement(reconstructed_dna_seq)
- # we use an area of +/- 1500 nucleotides around the seed position
- if pos > 1501:
- ds_offset = 1500
+ assert not '[' in read_seq and not ']' in read_seq
+
+ # we use an area of +/- `self.half_window_size` nucleotides around the seed position
+ if pos > self.half_window_size+1:
+ us_offset = self.half_window_size
else:
- ds_offset = pos-1
+ us_offset = pos - 1
- us_offset = 1500
+ if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
+ ds_offset = self.half_window_size
+ else:
+ ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
- genomicSeq_start = pos - ds_offset
- genomicSeq_stop = pos + us_offset
+ genomicSeq_start = pos - us_offset
+ genomicSeq_stop = pos + ds_offset
- #try:
dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
- #except:
+
+ #if strand == '-':
+ # dna_part = dna_seq[us_offset-len(read_seq):us_offset]
+ #else:
+ # dna_part = dna_seq[us_offset:us_offset+len(read_seq)]
+
+ #if reconstructed_dna_seq != dna_part:
+ # not_consistent_ctr += 1
# pdb.set_trace()
- # Take into account that the results of the second run do not contain
- # the original reads anymore.
- #if first_map:
- # original_read = read_seq
- #else:
- # original_read = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
+ ## check whether everything is ok
+ # try:
+ # currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+ # except:
+ # problem_ctr += 1
+ # continue
- # in order to save some space we use a signed char to store the
+ # In order to save some space we use a signed char to store the
# qualities. Each quality element can range as follows: -128 <= elem <= 127
- prb = array.array('b',map(lambda x: ord(x)-64,_prb))
-
- #pdb.set_trace()
+ prb = array.array('b',map(lambda x: ord(x)-64,slist[5]))
# add instance to set
currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
currentQualities = [prb]
- # as one single read can have several vmatch matches we store all
+ # As one single read can have several vmatch matches we store all
# these matches under the unique id of the read
-
- #dataset.setdefault(id, []).append((currentSeqInfo,original_read,currentQualities))
dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
instance_counter += 1
print 'Total examples processed: %d' % instance_counter
+ print 'Not consistent ctr: %d' % not_consistent_ctr
return dataset
# usually we have two files to parse:
# the map file from the second run and a subset of the map file from the
# first run
- dataset = self.parse_map_file(dataset,self.map_file)
- dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
+ dataset = self.parse_map_file(dataset,self.map_file,True)
+ dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
self.dataset = dataset