import math
import numpy
-#from qpalma.parsers import *
-
-from Evaluation import load_chunks
import qparser
-
-def create_aglignment_file(allPredictions,out_fname):
+import cPickle
+import pdb
+def create_alignment_file(current_loc,out_fname):
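+ """Write an alignment file for the predictions stored at current_loc.
+
+ current_loc is expected to point to a cPickle file of predictions; each
+ prediction carries at least the keys 'start_pos', 'alignment', 'DPScores'
+ and 'predExons' that are read below. The formatted lines go to out_fname.
+ """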
import numpy
#qparser.parse_reads(filtered_reads)
out_fh = open(out_fname,'w+')
- #allPredictionsmllPredictions = load_chunks(current_loc)
- #allPredictions = cPickle.load(open(current_loc))
+ allPredictions = cPickle.load(open(current_loc))
allPositions = {}
start_pos = current_prediction['start_pos']
alignment = current_prediction['alignment']
+ DPScores = current_prediction['DPScores']
+
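+ # predExons are given relative to start_pos; shift them to absolute coordinates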
predExons = current_prediction['predExons']
predExons = [e+start_pos for e in predExons]
- #p_start = current_ground_truth['p_start']
- #e_stop = current_ground_truth['exon_stop']
- #e_start = current_ground_truth['exon_start']
- #p_stop = current_ground_truth['p_stop']
-
- # &match, &mismatch, &repmatch, &Ns, &Qgapcnt, &Qgapbs, &Tgapcnt, &Tgapbs,
- predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
- #predExons = predExons.T
-
- #print predExons
- #pdb.set_trace()
-
- match = predExons[0,1] - predExons[0,0]
- if predExons.shape == (2,2):
- match += predExons[1,1] - predExons[1,0]
-
- mismatch = 0
- repmatch = 0
- Ns = 0
- Qgapcnt = 0
- Qgapbs = 0
-
- Tgapcnt = 0
- Tgapbs = 0
-
- if predExons.shape == (2,2):
- Tgapbs += predExons[1,0] - predExons[0,1]
- Tgapcnt = 1
-
- # &strand, Qname, &Qsize,
-
- # &Qstart, &Qend, Tname, &Tsize,
- # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
tExonSizes,tStarts, tEnds) = alignment
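+ # q* refers to the query read, t* to the target (genomic) side of the alignment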
- #
- # ATTENTION
- #
- # we enforce equal exons sizes for q and t because we are missing indel
- # positions of the alignment
-
- if qExonSizes[0] != tExonSizes[0]:
- continue
-
- Qname = str(id)
- Qsize = len(seq)
- Qstart = qStart
- Qend = qEnd
- Tname = 'CHR%d'%chromo
-
- start_pos -= 2
-
- Tsize = tEnd+1 + start_pos
- Tstart = tStart + start_pos
- Tend = tEnd + start_pos
- blockcnt = Tgapcnt+1
- exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
- Qstarts_str = str(qStarts)[1:-1].replace(' ','')
- Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
-
- #new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
- #(id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
- #str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
- #str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
- #str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
-
- new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
- % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
- Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
- Tname, Tsize, Tstart, Tend,\
- blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
-
+
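+ # EST2GFF gates the PSL/BLAT-style field computation below (match, gap and
+ # block counts); it is switched off here, so only the tab-separated
+ # prediction line further down gets written.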
+ EST2GFF = False
+ if EST2GFF:
+ predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+
+ match = predExons[0,1] - predExons[0,0]
+ if predExons.shape == (2,2):
+ match += predExons[1,1] - predExons[1,0]
+
+ mismatch = 0
+ repmatch = 0
+ Ns = 0
+ Qgapcnt = 0
+ Qgapbs = 0
+
+ Tgapcnt = 0
+ Tgapbs = 0
+
+ if predExons.shape == (2,2):
+ Tgapbs += predExons[1,0] - predExons[0,1]
+ Tgapcnt = 1
+
+ # &strand, Qname, &Qsize,
+ # &Qstart, &Qend, Tname, &Tsize,
+ # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
+
+ #
+ # ATTENTION
+ #
+ # we enforce equal exon sizes for q and t because we are missing the indel
+ # positions of the alignment
+
+ if qExonSizes[0] != tExonSizes[0]:
+ continue
+
+ Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
+ Qsize = len(seq)
+ Qstart = qStart
+ Qend = qEnd
+ Tname = 'CHR%d'%chromo
+
+ start_pos -= 2
+
+ Tsize = tEnd+1 + start_pos
+ Tstart = tStart + start_pos
+ Tend = tEnd + start_pos
+ blockcnt = Tgapcnt+1
+ exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+ Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+ Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
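+ # one tab-separated line per prediction: id, chromo, strand, seq, q1,
+ # start_pos, qStart, qEnd, tStart, tEnd, num_exons, followed by the
+ # qExonSizes/qStarts/qEnds and tExonSizes/tStarts/tEnds lists with brackets
+ # and spaces stripped; on a formatting error we drop into pdb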
+ try:
+ new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+ (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+ str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+ str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+ str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+ except:
+ pdb.set_trace()
+
+ #new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+ #% (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+ #Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+ #Tname, Tsize, Tstart, Tend,\
+ #blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
out_fh.write(new_line)
if __name__ == '__main__':
current_dir = sys.argv[1]
out_fh = sys.argv[2]
- create_aglignment_file(current_dir,out_fh)
+ create_alignment_file(current_dir,out_fh)
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
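+"""Submit one pythongrid job per prediction chunk: every job calls
+create_alignment_file on its chunk and writes a *.align_remap result file."""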
+
+import cPickle
+import sys
+import pdb
+import os
+import os.path
+import math
+
+from pythongrid import Job, KybJob, MethodJob, processJobs, Usage, processJobsLocally
+
+from createAlignmentFileFromPrediction import *
+
+import grid_alignment
+
+from Utils import split_file_join_results
+
+
+def g_alignment(chunk_fn,result_fn):
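+ """Entry point executed on the compute node: simply forwards to
+ create_alignment_file. Kept at module level, presumably so pythongrid can
+ resolve it as grid_alignment.g_alignment when the job runs."""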
+ create_alignment_file(chunk_fn,result_fn)
+
+
+def create_and_submit():
+ jp = os.path.join
+
+ run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+ data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+
+ chunks_fn = []
+ for fn in os.listdir(run_dir):
+ if fn.startswith('chunk'):
+ chunks_fn.append(fn)
+
+ functionJobs=[]
+
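+ # one grid job per chunk file: the predictions in run_dir/chunkNN.* are
+ # turned into data_dir/chunkNN.align_remap via g_alignment; each job
+ # requests 15G of memory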
+ for chunk_fn in chunks_fn:
+ chunk_name = chunk_fn[:chunk_fn.find('.')]
+ result_fn = jp(data_dir,'%s.align_remap'%chunk_name)
+ chunk_fn = jp(run_dir,chunk_fn)
+
+ #pdb.set_trace()
+
+ current_job = KybJob(grid_alignment.g_alignment,[chunk_fn,result_fn])
+ current_job.h_vmem = '15.0G'
+ #current_job.express = 'True'
+
+ print "job: ", current_job.nativeSpecification
+
+ functionJobs.append(current_job)
+
+ processedFunctionJobs = processJobs(functionJobs)
+ print "ret fields AFTER execution on cluster"
+ for (i, job) in enumerate(processedFunctionJobs):
+ print "Job with id: ", i, "- ret: ", job.ret
+
+
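+# Rough usage sketch: running this script submits one job per chunk to the
+# cluster. A single chunk can also be processed directly (this is what each
+# grid job effectively does) with:
+#
+#   python createAlignmentFileFromPrediction.py <prediction_chunk.pickle> <out_file>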
+if __name__ == '__main__':
+ #split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
+ create_and_submit()