1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23   
 24   
 25   
 26   
 27  __author__ = "Michael Imelfort" 
 28  __copyright__ = "Copyright 2014" 
 29  __credits__ = ["Michael Imelfort"] 
 30  __license__ = "LGPLv3" 
 31  __maintainer__ = "Michael Imelfort" 
 32  __email__ = "mike@mikeimelfort.com" 
 33   
 34   
 35   
 36   
 37  import os 
 38  import ctypes as c 
 39  from multiprocessing import Pool, Manager 
 40  import multiprocessing as mp 
 41  import numpy as np 
 42  import sys 
 43   
 44   
 45  from cWrapper import * 
 46  from bamLink import * 
 47  from bamFile import * 
 48  from bammExceptions import * 
 49   
 50   
 51   
 52   
 53   
 54   
 55   
 56   
 57   
 58   
 59   
 60   
 66      '''Single-process BAMfile parsing 
 67   
 68      cTypes pointers are unpickleable unless they are top level, so this function 
 69      lives outside the class. In this case we reduce the number of member 
 70      variables passed to it by passing the class instead. Any implicit copy 
 71      operations do not affect the workflow as it stands now. If you modify this 
 72      function you need to be aware of the limitations of python multiprocessing, 
 73      Queues, pickling and shared memory. 
 74   
 75      Extra logic is also contained in BamParser._parseOneBam 
 76   
 77      Inputs: 
 78       bAMpARSER - BamParser instance, a valid BamParser instance 
 79       parseQueue - Manager.Queue, bids (BAMs) yet to be parsed 
 80       BFI_list - Manager.List, place all processed BFIs on this list 
 81       verbose - == True -> be verbose 
 82       doContigNames - == True -> load contigs names from the C-land BFI struct 
 83      ''' 
 84      CW = CWrapper() 
 85      while True: 
 86           
 87          bid = parseQueue.get(block=True, timeout=None) 
 88          if bid is None:  
 89              break 
 90   
 91          if verbose: 
 92              print "Parsing file: %s" % bAMpARSER.bamFiles[bid] 
 93   
 94           
 95          coverages = [] 
 96          contig_lengths = [] 
 97          contig_names = [] 
 98          links = {} 
 99   
100          BFI = bAMpARSER._parseOneBam(bid) 
101   
102           
103          if bAMpARSER.doCovs or bAMpARSER.doLinks: 
104              contig_lengths = \ 
105                  np.array([int(i) for i in 
106                            c.cast(BFI.contigLengths, 
107                                   c.POINTER(c.c_uint32*BFI.numContigs)).contents 
108                            ]) 
109   
110              coverages = np.array([[float(j) for j in c.cast(i, c.POINTER(c.c_float*BFI.numBams)).contents] for i in 
111                                    c.cast(BFI.coverages, c.POINTER(c.POINTER(c.c_float*BFI.numBams)*BFI.numContigs)).contents]) 
112   
113               
114              if doContigNames: 
115                  contig_names = [] 
116                  contig_name_lengths = \ 
117                      np.array([int(i) for i in 
118                                c.cast(BFI.contigNameLengths, 
119                                       c.POINTER(c.c_uint16*BFI.numContigs) 
120                                       ).contents 
121                                ]) 
122   
123                  contig_name_array = \ 
124                      c.cast(BFI.contigNames, 
125                             c.POINTER(c.POINTER(c.c_char)*BFI.numContigs) 
126                             ).contents 
127   
128                  for i in range(BFI.numContigs): 
129                      contig_names.append((c.cast(contig_name_array[i], 
130                                                  c.POINTER(c.c_char * \ 
131                                                            contig_name_lengths[i] 
132                                                            ) 
133                                                  ).contents 
134                                           ).value 
135                                          ) 
136   
137           
138          bam_file_name = bAMpARSER.bamFiles[bid] 
139          BF = BM_bamFile(bid, bam_file_name) 
140          BF_C = \ 
141              (c.cast(BFI.bamFiles, 
142                      c.POINTER(c.POINTER(BM_bamFile_C)*1)).contents)[0].contents 
143   
144          num_types = BF_C.numTypes 
145          BTs_C = c.cast(BF_C.types, 
146                         c.POINTER(c.POINTER(BM_bamType_C)*num_types)).contents 
147   
148          for bt_c in BTs_C: 
149              BT = BM_bamType((bt_c.contents).orientationType, 
150                              (bt_c.contents).insertSize, 
151                              (bt_c.contents).insertStdev, 
152                              (bt_c.contents).supporting) 
153              BF.types.append(BT) 
154   
155          if bAMpARSER.doLinks: 
156              links = pythonizeLinks(BFI, BF) 
157          else: 
158              links = {} 
159   
160           
161          BBFI = BM_fileInfo(coverages, 
162                             contig_lengths, 
163                             BFI.numBams, 
164                             BFI.numContigs, 
165                             contig_names, 
166                             [BF], 
167                             links) 
168   
169           
170          BFI_list.append(BBFI) 
171   
172           
173          pBFI = c.POINTER(BM_fileInfo_C) 
174          pBFI = c.pointer(BFI) 
175          CW._destroyBFI(pBFI) 
176   
177          if doContigNames: 
178               
179              doContigNames = False 
 180   
182      '''Unpeel the links-associated C structs and return a python dictionary 
183      of LinkPair instances 
184   
185      Inputs: 
186       BFI - BM_fileInfo_C, C-land BamFileInfo struct 
187       bamFile - uid of the bamFile associated with the BFI 
188   
189      Outputs: 
190       A python dictionary of LinkPair instances 
191      ''' 
192      links = {} 
193      CW = CWrapper() 
194      pBFI = c.POINTER(BM_fileInfo_C) 
195      pBFI = c.pointer(BFI) 
196   
197      LW = BM_LinkWalker_C() 
198      pLW = c.POINTER(BM_LinkWalker_C) 
199      pLW = c.pointer(LW) 
200      success = CW._initLW(pLW, pBFI) 
201      if(success == 2): 
202          ret_val = 2 
203          LP = None 
204          while(ret_val != 0): 
205              if ret_val == 2: 
206                   
207                  LP = BM_linkPair(((LW.pair).contents).cid1, 
208                                   ((LW.pair).contents).cid2) 
209                   
210                  links[LP.makeKey()] = LP 
211               
212              LI = (LW.LI).contents 
213              LP.addLink(LI.reversed1, 
214                         LI.reversed2, 
215                         LI.pos1, 
216                         LI.pos2, 
217                         bamFile) 
218              ret_val = CW._stepLW(pLW) 
219          CW._destroyLW(pLW) 
220   
221      return links 
 222   
223   
224   
225   
226   
227   
229      ''' class for generating coverage profiles and linking read information 
230      from several BAM files 
231   
232      Uses python multiprocessing to parallelize processing 
233      ''' 
234   
235 -    def __init__(self, 
236                   coverageType, 
237                   minLength=0, 
238                   baseQuality=0, 
239                   mappingQuality=0, 
240                   useSuppAlignments=False, 
241                   useSecondaryAlignments=False, 
242                   maxMisMatches=1000 
243                   ): 
 244          ''' 
245          Default constructor. 
246   
247          Set quality thresholds used in later parsing 
248   
249          Inputs: 
250           coverageType - BM_coverageType, stores type of coverage to calculate 
251           minLength - int, ignore contigs shorter than this length 
252           mappingQuality - int, ignore positions with a lower base quality score 
253           mappingQuality - int, skip all reads with a lower mapping quality score 
254           useSuppAlignments - == True -> DON'T skip supplementary alignments 
255           useSecondaryAlignments - == True -> DON'T skip secondary alignments 
256           maxMisMatches - int, skip all reads with more mismatches (NM aux files) 
257   
258          Outputs: 
259           None 
260          ''' 
261           
262           
263           
264          self.baseQuality = baseQuality 
265          self.mappingQuality = mappingQuality 
266          self.minLength = minLength 
267          self.maxMisMatches = maxMisMatches 
268   
269          if useSuppAlignments: 
270              self.ignoreSuppAlignments = 0 
271          else: 
272              self.ignoreSuppAlignments = 1 
273   
274          if useSuppAlignments: 
275              self.ignoreSecondaryAlignments = 0 
276          else: 
277              self.ignoreSecondaryAlignments = 1 
278   
279          self.coverageType = coverageType 
280   
281           
282           
283           
284          self.BFI = None           
285   
286           
287          self.bamFiles = [] 
288          self.types = [] 
289          self.doLinks = False 
290          self.doCovs = False 
 291   
292   
293   
294   
295 -    def parseBams(self, 
296                    bamFiles, 
297                    doLinks=False, 
298                    doCovs=False, 
299                    types=None, 
300                    threads=1, 
301                    verbose=False): 
 302          '''Parse bam files to get coverage and linking reads. 
303   
304          Manages threading 
305          Stores results in internal mapping results list 
306   
307          Inputs: 
308           bamFiles - [string], full paths to BAMs 
309           doLinks - == True -> find linking pairs 
310           doCovs - == True -> calculate coverage profiles 
311           types - [int] or None, number of insert types per bamfile 
312           threads - int, max number of threads to use 
313           verbose - == True -> be verbose 
314   
315          Outputs: 
316           0 if the parsing worked, 1 otherwise 
317          ''' 
318           
319          self.bamFiles = bamFiles 
320   
321           
322          if types is None: 
323              self.types = [1]*len(self.bamFiles) 
324          else: 
325              self.types = types 
326   
327          if len(self.types) != len(self.bamFiles): 
328              raise InvalidNumberOfTypesException("%d types for %d BAM files" % \ 
329                                                  (len(self.types), 
330                                                   len(self.bamFiles))) 
331   
332           
333          self.doLinks = doLinks 
334          self.doCovs = doCovs 
335   
336           
337          for bam in bamFiles: 
338              if not os.path.isfile(bam): 
339                  raise BAMFileNotFoundException("BAM file %s not found" % bam) 
340              elif not os.path.isfile("%s.bai" % bam): 
341                  raise BAMIndexNotFoundException("Index file %s not found" % \ 
342                                                  ("%s.bai" % bam)) 
343   
344           
345          parse_queue = mp.Queue() 
346           
347          BFI_list = mp.Manager().list() 
348   
349           
350          for bid in range(len(bamFiles)): 
351              parse_queue.put(bid) 
352   
353           
354          for _ in range(threads): 
355              parse_queue.put(None) 
356   
357          try: 
358               
359              parse_proc = [mp.Process(target=externalParseWrapper, 
360                                       args = (self, 
361                                               parse_queue, 
362                                               BFI_list, 
363                                               verbose, 
364                                               True) 
365                                       ) 
366                            ] 
367               
368              parse_proc += [mp.Process(target=externalParseWrapper, 
369                                        args = (self, 
370                                                parse_queue, 
371                                                BFI_list, 
372                                                verbose, 
373                                                True) 
374                                        ) for _ in range(threads-1) 
375                             ] 
376   
377              for p in parse_proc: 
378                  p.start() 
379   
380              for p in parse_proc: 
381                  p.join() 
382   
383               
384              self.collapseBFIs(BFI_list) 
385   
386               
387              return 0 
388   
389          except  (KeyboardInterrupt, SystemExit): 
390               
391              for p in parse_proc: 
392                  p.terminate() 
393   
394               
395              return 1 
 396   
398          '''Parse a single BAM file and append the result 
399          to the internal mapping results list 
400   
401          Called from the ExternalParseWrapper 
402   
403          Inputs: 
404           bid - unique identifier of the BAM to parse 
405   
406          Outputs: 
407           A populated BM_FileInfo_C  struct containing the parsing results 
408          ''' 
409           
410           
411          BFI = BM_fileInfo_C() 
412          pBFI = c.POINTER(BM_fileInfo_C) 
413          pBFI = c.pointer(BFI) 
414   
415          BCT = BM_coverageType_C() 
416          BCT.type = self.coverageType.cType 
417          BCT.upperCut = float(self.coverageType.cUpper) 
418          BCT.lowerCut = float(self.coverageType.cLower) 
419          pBCT = c.POINTER(BM_coverageType_C) 
420          pBCT = c.pointer(BCT) 
421   
422          bamfiles_c_array = (c.c_char_p * 1)() 
423          bamfiles_c_array[:] = [self.bamFiles[bid]] 
424   
425          types_c_array = (c.c_int * 1)() 
426          types_c_array[:] = [self.types[bid]] 
427   
428          CW = CWrapper() 
429          if self.doLinks or self.doCovs: 
430              CW._parseCoverageAndLinks(self.doLinks, 
431                                        self.doCovs, 
432                                        1,         
433                                        self.baseQuality, 
434                                        self.mappingQuality, 
435                                        self.minLength, 
436                                        self.maxMisMatches, 
437                                        types_c_array, 
438                                        self.ignoreSuppAlignments, 
439                                        self.ignoreSecondaryAlignments, 
440                                        pBCT, 
441                                        bamfiles_c_array, 
442                                        pBFI) 
443          else: 
444               
445              BCT.type = CT.NONE  
446              CW._parseCoverageAndLinks(self.doLinks, 
447                                        self.doCovs, 
448                                        1,         
449                                        0, 
450                                        0, 
451                                        0, 
452                                        0, 
453                                        types_c_array, 
454                                        1, 
455                                        1, 
456                                        pBCT, 
457                                        bamfiles_c_array, 
458                                        pBFI) 
459   
460          return BFI 
 461   
463          '''Collapse multiple BFI objects into one and 
464          set it as member variable BFI 
465   
466          Only one of the threads will bother to parse contig names. Find it's 
467          BFI and then merge all the other BFIs (from other threads) into it 
468   
469          Inputs: 
470           BFI_list - Manager.List, list of all BFIs created during parsing 
471   
472          Outputs: 
473           None 
474          ''' 
475          baseBFI_index = 0 
476          if self.doCovs or self.doLinks: 
477               
478              for i in range(len(BFI_list)): 
479                  if len(BFI_list[i].contigNames) > 0: 
480                      baseBFI_index = i 
481                      break 
482   
483           
484          self.BFI = BFI_list[baseBFI_index] 
485          for i in range(len(BFI_list)): 
486              if i != baseBFI_index: 
487                  self.BFI.consume(BFI_list[i]) 
 488   
489   
490   
491   
493          '''Print template size and orientation information 
494          to a file or to stdout. 
495   
496          Inputs: 
497           fileName - string, full path to output file or "" 
498   
499          Outputs: 
500           None 
501          ''' 
502          if self.BFI is None: 
503              raise NoBAMSFoundException 
504          else: 
505              if fileName == "": 
506                  self.BFI.printBamTypes(sys.stdout) 
507              else: 
508                  with open(fileName, "w") as fh: 
509                      self.BFI.printBamTypes(fh) 
 510   
512          '''Print coverage profile information to a file or to stdout 
513   
514          Inputs: 
515           fileName - string, full path to output file or "" 
516   
517          Outputs: 
518           None 
519          ''' 
520          if self.BFI is None: 
521              raise NoBAMSFoundException 
522          else: 
523              if fileName == "": 
524                  self.BFI.printCoverages(sys.stdout) 
525              else: 
526                  with open(fileName, "w") as fh: 
527                      self.BFI.printCoverages(fh) 
 528   
530          '''Print paired links information to a file or to stdout 
531   
532          Inputs: 
533           fileName - string, full path to output file or "" 
534   
535          Outputs: 
536           None 
537          ''' 
538          if self.BFI is None: 
539              raise NoBAMSFoundException 
540          else: 
541              if fileName == "": 
542                  self.BFI.printLinks(sys.stdout) 
543              else: 
544                  with open(fileName, "w") as fh: 
545                      self.BFI.printLinks(dict(zip(range(len(self.bamFiles)),self.bamFiles)), fh) 
  546   
547   
548   
549   
550   
551