from qpalma.TrainingParam import Param
from qpalma.Plif import Plf
-#from qpalma.tools.splicesites import getDonAccScores
from qpalma.Configuration import *
-#from compile_dataset import getSpliceScores, get_seq_and_scores
-
from numpy.matlib import mat,zeros,ones,inf
from numpy import inf,mean
from qpalma.parsers import *
+from ParaParser import *
+
from Lookup import *
from qpalma.sequence_utils import *
+
+class BracketWrapper:
+ fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
+ 'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+
+ def __init__(self,filename):
+ self.parser = ParaParser("%lu%d%d%s%d%d%d%s%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+ self.parser.parseFile(filename)
+
+ def __len__(self):
+ return self.parser.getSize(IN_VECTOR)
+
+ def __getitem__(self,key):
+ return self.parser.fetchEntry(key)
+
+ def __iter__(self):
+ self.counter = 0
+ self.size = self.parser.getSize(IN_VECTOR)
+ return self
+
+ def next(self):
+ if not self.counter < self.size:
+ raise StopIteration
+ return_val = self.parser.fetchEntry(self.counter)
+ self.counter += 1
+ return return_val
+
+
class PipelineHeuristic:
"""
This class wraps the filter which decides whether an alignment found by
vmatch is spliced an should be then newly aligned using QPalma or not.
"""
- def __init__(self,run_fname,data_fname,reads_pipeline_fn,param_fname,result_spliced_fname,result_unspliced_fname):
+ def __init__(self,run_fname,data_fname,param_fname,result_fname):
"""
We need a run object holding information about the nr. of support points
etc.
dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
start = cpu()
- self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
+
+ # old version
+ #self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
+ self.all_remapped_reads = BracketWrapper(data_fname)
+
stop = cpu()
print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
stop = cpu()
print 'prefetched sequence and splice data in %f sec' % (stop-start)
- self.result_spliced_fh = open(result_spliced_fname,'w+')
- self.result_unspliced_fh = open(result_unspliced_fname,'w+')
+ self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
+ self.result_unspliced_fh = open('%s.unspliced'%result_fname,'w+')
start = cpu()
self.mmatrix = mmatrix
self.qualityPlifs = qualityPlifs
- self.read_size = 36
+ #self.read_size = 36
# parameters of the heuristics to decide whether the read is spliced
#self.splice_thresh = 0.005
self.result_spliced_fh.write(p_line)
self.result_unspliced_fh.write(p_line)
- self.original_reads = {}
+ #self.original_reads = {}
- for line in open(reads_pipeline_fn):
- line = line.strip()
- id,seq,q1,q2,q3 = line.split()
- id = int(id)
- self.original_reads[id] = seq
+ # we do not have this information
+ #for line in open(reads_pipeline_fn):
+ # line = line.strip()
+ # id,seq,q1,q2,q3 = line.split()
+ # id = int(id)
+ # self.original_reads[id] = seq
lengthSP = run['numLengthSuppPoints']
donSP = run['numDonSuppPoints']
strand = strand_map[strand]
- if strand == '-':
+ #if strand == '-':
+ # continue
+
+ if not chr in range(1,6):
continue
unb_seq = unbracket_est(seq)
if __name__ == '__main__':
- if len(sys.argv) != 7:
- print 'Usage: ./%s data reads param run spliced.results unspliced.results' % (sys.argv[0])
+ if len(sys.argv) != 6:
+ print 'Usage: ./%s data param run spliced.results unspliced.results' % (sys.argv[0])
+ sys.exit(1)
data_fname = sys.argv[1]
- reads_pipeline_fn = sys.argv[2]
- param_fname = sys.argv[3]
- run_fname = sys.argv[4]
+ param_fname = sys.argv[2]
+ run_fname = sys.argv[3]
- result_spliced_fname = sys.argv[5]
- result_unspliced_fname = sys.argv[6]
+ result_spliced_fname = sys.argv[4]
+ result_unspliced_fname = sys.argv[5]
jp = os.path.join
- #dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test/run_+_quality_+_splicesignals_+_intron_len'
- #param_fname = jp(dir,'param_500.pickle')
- #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/current_data/map.vm_unspliced_20k'
- #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_2k'
- #data_fname = '/fml/ag-raetsch/share/projects/qpalma/solexa/pipeline_data/map.vm_100'
- #result_spliced_fname = 'splicedReads.heuristic'
- #result_unspliced_fname = 'unsplicedReads.heuristic'
-
- ph1 = PipelineHeuristic(run_fname,data_fname,reads_pipeline_fn,param_fname,result_spliced_fname,result_unspliced_fname)
+ ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_spliced_fname,result_unspliced_fname)
start = cpu()
ph1.filter()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import pdb
+
from numpy.matlib import mat,zeros,ones,inf
import palma.palma_utils as pu
##########
-
def split_file_join_results(filename,parts):
all_intervals = []
for line in open(filename,'r'):
line_ctr += 1
+ print 'found %d lines' % line_ctr
+
part = line_ctr / parts
begin = 0
end = 0
end = begin+part
params = (begin,end)
+ print begin,end
all_intervals.append(params)
+ pdb.set_trace()
+
infile = open(filename,'r')
parts_fn = []
out_fh.close()
-
+if __name__ == '__main__':
+ split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
import os
import os.path
import math
-from qpalma.parsers import *
+import numpy
+
+#from qpalma.parsers import *
from Evaluation import load_chunks
+import qparser
+
-def prediction_on(current_dir,filtered_reads,out_fname):
-
- import qparser
- qparser.parse_reads(filtered_reads)
+def create_aglignment_file(allPredictions,out_fname):
+ import numpy
+ #qparser.parse_reads(filtered_reads)
out_fh = open(out_fname,'w+')
- allPredictions = load_chunks(current_dir)
+ #allPredictionsmllPredictions = load_chunks(current_loc)
+ #allPredictions = cPickle.load(open(current_loc))
+
allPositions = {}
for current_prediction in allPredictions:
#true_cut = current_ground_truth['true_cut']
#seq = current_ground_truth['seq']
#q1 = current_ground_truth['prb']
+
seq = current_prediction['est']
+ dna = current_prediction['dna']
# CHECK !!!
q1 = 'zzz'
- chr = current_prediction['chr']
+ chromo = current_prediction['chr']
strand = current_prediction['strand']
start_pos = current_prediction['start_pos']
alignment = current_prediction['alignment']
predExons = current_prediction['predExons']
predExons = [e+start_pos for e in predExons]
- p_start = current_ground_truth['p_start']
- e_stop = current_ground_truth['exon_stop']
- e_start = current_ground_truth['exon_start']
- p_stop = current_ground_truth['p_stop']
+ #p_start = current_ground_truth['p_start']
+ #e_stop = current_ground_truth['exon_stop']
+ #e_start = current_ground_truth['exon_start']
+ #p_stop = current_ground_truth['p_stop']
+
+ # &match, &mismatch, &repmatch, &Ns, &Qgapcnt, &Qgapbs, &Tgapcnt, &Tgapbs,
+ predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+ #predExons = predExons.T
+
+ #print predExons
+ #pdb.set_trace()
+
+ match = predExons[0,1] - predExons[0,0]
+ if predExons.shape == (2,2):
+ match += predExons[1,1] - predExons[1,0]
+
+ mismatch = 0
+ repmatch = 0
+ Ns = 0
+ Qgapcnt = 0
+ Qgapbs = 0
+
+ Tgapcnt = 0
+ Tgapbs = 0
- correct_prediction = False
+ if predExons.shape == (2,2):
+ Tgapbs += predExons[1,0] - predExons[0,1]
+ Tgapcnt = 1
+
+ # &strand, Qname, &Qsize,
+
+ # &Qstart, &Qend, Tname, &Tsize,
+ # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
tExonSizes,tStarts, tEnds) = alignment
- new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- (id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
- str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
- str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
- str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+ #
+ # ATTENTION
+ #
+ # we enforce equal exons sizes for q and t because we are missing indel
+ # positions of the alignment
+
+ if qExonSizes[0] != tExonSizes[0]:
+ continue
+
+ Qname = str(id)
+ Qsize = len(seq)
+ Qstart = qStart
+ Qend = qEnd
+ Tname = 'CHR%d'%chromo
+
+ start_pos -= 2
+
+ Tsize = tEnd+1 + start_pos
+ Tstart = tStart + start_pos
+ Tend = tEnd + start_pos
+ blockcnt = Tgapcnt+1
+ exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+ Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+ Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
+ #new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+ #(id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+ #str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+ #str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+ #str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+
+ new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+ % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+ Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+ Tname, Tsize, Tstart, Tend,\
+ blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+
out_fh.write(new_line)
if __name__ == '__main__':
current_dir = sys.argv[1]
- filtered_reads = sys.argv[2]
- out_fh = sys.argv[3]
- prediction_on(current_dir,filtered_reads,out_fh)
+ out_fh = sys.argv[2]
+ create_aglignment_file(current_dir,out_fh)
data_fname = jp(data_dir,'map.vm.part_%d'%idx)
result_fname = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
- current_job = KybJob(grid_predict.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
+ current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
current_job.h_vmem = '25.0G'
#current_job.express = 'True'
for c_name,current_chunk in chunks:
current_job = KybJob(grid_predict.g_predict,[run,dataset_fn,current_chunk,param,c_name])
- current_job.h_vmem = '5.0G'
+ current_job.h_vmem = '12.0G'
#current_job.express = 'True'
print "job #1: ", current_job.nativeSpecification
run = cPickle.load(open(jp(run_dir,'run_obj.pickle')))
param = cPickle.load(open(jp(run_dir,'param_526.pickle')))
- dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_12_05_08.test.pickle'
- prediction_keys_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_12_05_08.test_keys.pickle'
+ dataset_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.pickle'
+ prediction_keys_fn = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/dataset_transcriptome_run_1.keys.pickle'
prediction_keys = cPickle.load(open(prediction_keys_fn))
print 'Found %d keys for prediction.' % len(prediction_keys)
- num_splits = 15
+ num_splits = 25
slices = get_slices(len(prediction_keys),num_splits)
chunks = []
for idx,slice in enumerate(slices):
import re
import os.path
-from qpalma.sequence_utils import getSpliceScores, get_seq_and_scores
+from qpalma.sequence_utils import *
import numpy
from numpy.matlib import mat,zeros,ones,inf
pass
-def unbracket_est(est):
- new_est = ''
- e = 0
-
- while True:
- if e >= len(est):
- break
-
- if est[e] == '[':
- new_est += est[e+2]
- e += 4
- else:
- new_est += est[e]
- e += 1
-
- return "".join(new_est).lower()
-
-
def getData(training_set,exampleKey,run):
currentSeqInfo,currentExons,original_est,currentQualities = training_set[exampleKey]
id,chr,strand,up_cut,down_cut = currentSeqInfo
if '-' in est:
self.plog('found gap\n')
est = est.replace('-','')
- assert len(est) == 36
+ assert len(est) == Conf.read_size
dna_len = len(dna)
est_len = len(est)
#!/bin/bash
run_dir=/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+
-data_dir=/fml/ag-raetsch/share/projects/qpalma/solexa/new_run
+data_dir=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data
-python PipelineHeuristic.py $data_dir/map.vm $data_dir/allReads.pipeline $run_dir/param_526.pickle $run_dir/run_obj.pickle $data_dir/spliced.heuristic $data_dir/unspliced.heuristic
+python PipelineHeuristic.py $data_dir/map.vm $run_dir/param_526.pickle $run_dir/run_obj.pickle $data_dir/spliced.heuristic $data_dir/unspliced.heuristic