+ optimized parsers
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 21:39:31 +0000 (21:39 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 21:39:31 +0000 (21:39 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8695 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/parsers.py

index 7bed1f2..60daf9b 100644 (file)
@@ -2,6 +2,9 @@
 # -*- coding: utf-8 -*-
 
 import pdb
+import os
+import sys
+import mmap
 
 class ReadParser:
    """
@@ -25,66 +28,92 @@ class ReadParser:
          
       return lines
 
-class FilteredReadParser(ReadParser):
-   """
-   This class offers a parser for the reads that are created by the first
-   filtering step performed to cut and join new reads.
-   """
-
-   def __init__(self,filename):
-      ReadParser.__init__(self,filename)
-
-   def parseLine(self,line):
-      """
-      We assume that a line has the following entries:
-      
-      read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
-
-      """
-      try:
-         id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-      except:
-         id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-         true_cut = -1
-
-      splitpos = int(splitpos)
-      read_size = int(read_size)
-
-      seq=seq.lower()
-
-      assert strand in ['D','P']
-
-      if strand == 'D':
-         strand = '+'
+#class FilteredReadParser(ReadParser):
+#   """
+#   This class offers a parser for the reads that are created by the first
+#   filtering step performed to cut and join new reads.
+#   """
+#
+#   def __init__(self,filename):
+#      ReadParser.__init__(self,filename)
+#
+#   def parseLine(self,line):
+#      """
+#      We assume that a line has the following entries:
+#      
+#      read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
+#
+#      """
+#      try:
+#         id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+#      except:
+#         id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+#         true_cut = -1
+#
+#      splitpos = int(splitpos)
+#      read_size = int(read_size)
+#
+#      seq=seq.lower()
+#
+#      assert strand in ['D','P']
+#
+#      if strand == 'D':
+#         strand = '+'
+#
+#      if strand == 'P':
+#         strand = '-'
+#
+#      chr = int(chr)
+#
+#      prb = [ord(elem)-50 for elem in prb]
+#      cal_prb = [ord(elem)-64 for elem in cal_prb]
+#      chastity = [ord(elem)+10 for elem in chastity]
+#
+#      p_start = int(p_start)
+#      exon_stop = int(exon_stop)
+#      exon_start = int(exon_start)
+#      p_stop = int(p_stop)
+#      true_cut = int(true_cut)
+#
+#      line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+#      'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+#      'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+#      'p_stop':p_stop,'true_cut':true_cut}
+#
+#      return line_d
+#
+
+
+
+def parse(filename):
+   entries = []
+   
+   fh = open(filename)
 
-      if strand == 'P':
-         strand = '-'
+   all_lines = fh.read()
+   all_lines = all_lines.split('\n')
 
-      chr = int(chr)
+   for parsed_line in all_lines:
+      if parsed_line == '':
+         continue
 
-      prb = [ord(elem)-50 for elem in prb]
-      cal_prb = [ord(elem)-64 for elem in cal_prb]
-      chastity = [ord(elem)+10 for elem in chastity]
+      id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity =\
+      parsed_line.split()
+      chr   = int(chr)
+      pos   = int(pos)
 
-      p_start = int(p_start)
-      exon_stop = int(exon_stop)
-      exon_start = int(exon_start)
-      p_stop = int(p_stop)
-      true_cut = int(true_cut)
+      mismatches  = int(mismatches)
+      length      = int(length)
+      offset      = int(offset)
 
-      line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
-      'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-      'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
-      'p_stop':p_stop,'true_cut':true_cut}
+      line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
+      'mismatches':mismatches, 'length':length, 'offset':offset,\
+      'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
 
-      return line_d
+      entries.append((line_d,parsed_line))
 
-   def next(self):
-      for line in self.fh:
-         line = line.strip()
-         yield self.parseLine(line)
+   return entries
 
-      raise StopIteration
 
    def parse(self):
       entries = {}
@@ -221,79 +250,186 @@ class MapParser(ReadParser):
       return entries
 
 
-class PipelineReadParser(ReadParser):
+
+def parse_map_vm_heuristic(filename):
    """
-   This class offers a parser for the reads that are remapped by the vmatch
-   utility. 
+   This function parse the map.vm file for the PipelineHeuristics
+   """
+   entries = []
+   
+   fh = open(filename)
 
-   According to the docu the entries are:
+   all_lines = fh.read()
+   all_lines = all_lines.split('\n')
+
+   for parsed_line in all_lines:
+      if parsed_line == '':
+         continue
+
+      id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity =\
+      parsed_line.split()
+      chr   = int(chr)
+      pos   = int(pos)
+
+      mismatches  = int(mismatches)
+      length      = int(length)
+      offset      = int(offset)
+
+      line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
+      'mismatches':mismatches, 'length':length, 'offset':offset,\
+      'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
+
+      entries.append((line_d,parsed_line))
+
+   return entries
+
+
+def parse_filtered_reads(filename):
+   """
    
-   ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
+   """
+   entries = {}
+   
+   #fh = open(filename)
+   #all_lines = fh.read()
+   #all_lines = all_lines.split('\n')
+
+   #for parsed_line in all_lines:
+   #   if parsed_line == '':
+   #      continue
+
+   data = map_file(filename)
+   
+   while True:
+
+      parsed_line = data.readline().strip()
+
+      if parsed_line == '':
+         break
+
+      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = parsed_line.split()
+
+      id = int(id)
+      splitpos = int(splitpos)
+      read_size = int(read_size)
+
+      seq=seq.lower()
+
+      assert strand in ['D','P']
+
+      if strand == 'D':
+         strand = '+'
+
+      if strand == 'P':
+         strand = '-'
+
+      chr = int(chr)
+
+      prb = [ord(elem)-50 for elem in prb]
+      cal_prb = [ord(elem)-64 for elem in cal_prb]
+      chastity = [ord(elem)+10 for elem in chastity]
 
+      p_start = int(p_start)
+      exon_stop = int(exon_stop)
+      exon_start = int(exon_start)
+      p_stop = int(p_stop)
+      true_cut = int(true_cut)
+
+      line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+      'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+      'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+      'p_stop':p_stop,'true_cut':true_cut}
+
+      id = line_d['id']
+      assert id not in entries, pdb.set_trace()
+      entries[id] = line_d
+
+   return entries
+
+
+def parse_map_vm(filename):
    """
+   Parses the map.vm files from Vmatch
 
-   def __init__(self,filename):
-      ReadParser.__init__(self,filename)
 
-   def parseLine(self,line):
-      """
-      We assume that a line has the following entries:
+   For an original read we can have several vmatch locations
+   """
+   entries = {}
+   
+   #fh = open(filename)
+   #all_lines = fh.read()
+   #all_lines = all_lines.split('\n')
 
-      """
+   #for parsed_line in all_lines:
+   #   if parsed_line == '':
+   #      continue
+
+   data = map_file(filename)
+   
+   while True:
+
+      parsed_line = data.readline()
+
+      if parsed_line == '':
+         break
+
+
+      id,chr,pos,strand,mismatches,length,offset,seq,q1,q2,q3 = parsed_line.split()
 
-      #id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity,orig_seq,is_spliced = line.split()
-      id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity = line.split()
+      id = int(id)
 
       chr   = int(chr)
       pos   = int(pos)
 
+      if strand == 'D':
+         strand = '+'
+
+      if strand == 'P':
+         strand = '-'
+
       mismatches  = int(mismatches)
       length      = int(length)
       offset      = int(offset)
 
       seq=seq.lower()
-      #orig_seq=orig_seq.lower()
-
-      #if is_spliced == '1': 
-      #   is_spliced = True
-      #else:
-      #   is_spliced = False
 
       line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
-      'mismatches':mismatches, 'length':length, 'offset':offset,\
-      'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
+      'mismatches':mismatches, 'length':length, 'offset':offset}
+   
+      try:
+         entries[id] = [line_d]
+      except:
+         old_entry = entries[id]
+         old_entry.append(line_d)
+         entries[id] = old_entry
 
-      return line_d
+   return entries
 
 
-   def parse(self):
-      entries = []
 
-      all_lines = self.fh.read()
-      all_lines = all_lines.split('\n')
+def map_file(filename):
+   """
+   As the mmapped file object behaves just like plain file objects
+   you do not have to unmap but just close the file.
+   """
 
-      for parsed_line in all_lines:
-         if parsed_line == '':
-            continue
+   #filename = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map.vm'
 
-      #for parsed_line in self.fh:
-         id,chr,pos,strand,mismatches,length,offset,seq,prb,cal_prb,chastity =\
-         parsed_line.split()
-         chr   = int(chr)
-         pos   = int(pos)
+   file = open(filename, "r+")
+   size = os.path.getsize(filename)
 
-         mismatches  = int(mismatches)
-         length      = int(length)
-         offset      = int(offset)
+   data = mmap.mmap(file.fileno(), size)
+   assert size == len(data), 'Could not map whole file at once!'
 
-         line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
-         'mismatches':mismatches, 'length':length, 'offset':offset,\
-         'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
+   return data
 
-         #line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
-         #'mismatches':mismatches, 'length':length, 'offset':offset,\
-         #'seq':seq,'prb':prb,'cal_prb':cal_prb,'chastity':chastity}
 
-         entries.append((line_d,parsed_line))
+"""
+while True:
+   line = data.readline()
 
-      return entries
+   if line == '':
+      break
+
+   print line
+"""