Package bamm :: Module bamParser
[hide private]
[frames] | no frames]

Source Code for Module bamm.bamParser

  1  #!/usr/bin/env python 
  2  ############################################################################### 
  3  #                                                                             # 
  4  #    BamParser.py                                                             # 
  5  #                                                                             # 
  6  #    Class for parsing BAM files                                              # 
  7  #                                                                             # 
  8  #    Copyright (C) Michael Imelfort                                           # 
  9  #                                                                             # 
 10  ############################################################################### 
 11  #                                                                             # 
 12  #    This library is free software; you can redistribute it and/or            # 
 13  #    modify it under the terms of the GNU Lesser General Public               # 
 14  #    License as published by the Free Software Foundation; either             # 
 15  #    version 3.0 of the License, or (at your option) any later version.       # 
 16  #                                                                             # 
 17  #    This library is distributed in the hope that it will be useful,          # 
 18  #    but WITHOUT ANY WARRANTY; without even the implied warranty of           # 
 19  #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU        # 
 20  #    Lesser General Public License for more details.                          # 
 21  #                                                                             # 
 22  #    You should have received a copy of the GNU Lesser General Public         # 
 23  #    License along with this library.                                         # 
 24  #                                                                             # 
 25  ############################################################################### 
 26   
 27  __author__ = "Michael Imelfort" 
 28  __copyright__ = "Copyright 2014" 
 29  __credits__ = ["Michael Imelfort"] 
 30  __license__ = "LGPLv3" 
 31  __maintainer__ = "Michael Imelfort" 
 32  __email__ = "mike@mikeimelfort.com" 
 33   
 34  ############################################################################### 
 35   
 36  # system imports 
 37  import os 
 38  import ctypes as c 
 39  from multiprocessing import Pool, Manager 
 40  import multiprocessing as mp 
 41  import numpy as np 
 42  import sys 
 43   
 44  # local imports 
 45  from cWrapper import * 
 46  from bamLink import * 
 47  from bamFile import * 
 48  from bammExceptions import * 
 49   
 50  ############################################################################### 
 51  ############################################################################### 
 52  ############################################################################### 
 53  # Multiprocessing requires that all passed items be pickleable. That is they 
 54  # must be vanilla variables or functions defined in the file itself, ie. not 
 55  # within a class. We get around this by writing an external function which calls 
 56  # a class function. Hacky, but it works. 
 57  ############################################################################### 
 58  ############################################################################### 
 59  ############################################################################### 
 60   
61 -def externalParseWrapper(bAMpARSER, 62 parseQueue, 63 BFI_list, 64 verbose, 65 doContigNames):
66 '''Single-process BAMfile parsing 67 68 cTypes pointers are unpickleable unless they are top level, so this function 69 lives outside the class. In this case we reduce the number of member 70 variables passed to it by passing the class instead. Any implicit copy 71 operations do not affect the workflow as it stands now. If you modify this 72 function you need to be aware of the limitations of python multiprocessing, 73 Queues, pickling and shared memory. 74 75 Extra logic is also contained in BamParser._parseOneBam 76 77 Inputs: 78 bAMpARSER - BamParser instance, a valid BamParser instance 79 parseQueue - Manager.Queue, bids (BAMs) yet to be parsed 80 BFI_list - Manager.List, place all processed BFIs on this list 81 verbose - == True -> be verbose 82 doContigNames - == True -> load contigs names from the C-land BFI struct 83 ''' 84 CW = CWrapper() 85 while True: 86 # get the next one off the list 87 bid = parseQueue.get(block=True, timeout=None) 88 if bid is None: # poison pill 89 break 90 91 if verbose: 92 print "Parsing file: %s" % bAMpARSER.bamFiles[bid] 93 94 # go back into the class to do the work 95 coverages = [] 96 contig_lengths = [] 97 contig_names = [] 98 links = {} 99 100 BFI = bAMpARSER._parseOneBam(bid) 101 102 # only do this if we are doing covs or links (or both) 103 if bAMpARSER.doCovs or bAMpARSER.doLinks: 104 contig_lengths = \ 105 np.array([int(i) for i in 106 c.cast(BFI.contigLengths, 107 c.POINTER(c.c_uint32*BFI.numContigs)).contents 108 ]) 109 110 coverages = np.array([[float(j) for j in c.cast(i, c.POINTER(c.c_float*BFI.numBams)).contents] for i in 111 c.cast(BFI.coverages, c.POINTER(c.POINTER(c.c_float*BFI.numBams)*BFI.numContigs)).contents]) 112 113 # we only need to do the contig names for one of the threads 114 if doContigNames: 115 contig_names = [] 116 contig_name_lengths = \ 117 np.array([int(i) for i in 118 c.cast(BFI.contigNameLengths, 119 c.POINTER(c.c_uint16*BFI.numContigs) 120 ).contents 121 ]) 122 123 contig_name_array = \ 124 c.cast(BFI.contigNames, 125 c.POINTER(c.POINTER(c.c_char)*BFI.numContigs) 126 ).contents 127 128 for i in range(BFI.numContigs): 129 contig_names.append((c.cast(contig_name_array[i], 130 c.POINTER(c.c_char * \ 131 contig_name_lengths[i] 132 ) 133 ).contents 134 ).value 135 ) 136 137 # we always populate the bam file type information classes 138 bam_file_name = bAMpARSER.bamFiles[bid] 139 BF = BM_bamFile(bid, bam_file_name) 140 BF_C = \ 141 (c.cast(BFI.bamFiles, 142 c.POINTER(c.POINTER(BM_bamFile_C)*1)).contents)[0].contents 143 144 num_types = BF_C.numTypes 145 BTs_C = c.cast(BF_C.types, 146 c.POINTER(c.POINTER(BM_bamType_C)*num_types)).contents 147 148 for bt_c in BTs_C: 149 BT = BM_bamType((bt_c.contents).orientationType, 150 (bt_c.contents).insertSize, 151 (bt_c.contents).insertStdev, 152 (bt_c.contents).supporting) 153 BF.types.append(BT) 154 155 if bAMpARSER.doLinks: 156 links = pythonizeLinks(BFI, BF) 157 else: 158 links = {} 159 160 # make the python object 161 BBFI = BM_fileInfo(coverages, 162 contig_lengths, 163 BFI.numBams, 164 BFI.numContigs, 165 contig_names, 166 [BF], 167 links) 168 169 # append onto the global list 170 BFI_list.append(BBFI) 171 172 # destroy the C-allocateed memory 173 pBFI = c.POINTER(BM_fileInfo_C) 174 pBFI = c.pointer(BFI) 175 CW._destroyBFI(pBFI) 176 177 if doContigNames: 178 # we only need to parse the contig names once 179 doContigNames = False
180 222 223 ############################################################################### 224 ############################################################################### 225 ############################################################################### 226 ############################################################################### 227
228 -class BamParser:
229 ''' class for generating coverage profiles and linking read information 230 from several BAM files 231 232 Uses python multiprocessing to parallelize processing 233 ''' 234
235 - def __init__(self, 236 coverageType, 237 minLength=0, 238 baseQuality=0, 239 mappingQuality=0, 240 useSuppAlignments=False, 241 useSecondaryAlignments=False, 242 maxMisMatches=1000 243 ):
244 ''' 245 Default constructor. 246 247 Set quality thresholds used in later parsing 248 249 Inputs: 250 coverageType - BM_coverageType, stores type of coverage to calculate 251 minLength - int, ignore contigs shorter than this length 252 mappingQuality - int, ignore positions with a lower base quality score 253 mappingQuality - int, skip all reads with a lower mapping quality score 254 useSuppAlignments - == True -> DON'T skip supplementary alignments 255 useSecondaryAlignments - == True -> DON'T skip secondary alignments 256 maxMisMatches - int, skip all reads with more mismatches (NM aux files) 257 258 Outputs: 259 None 260 ''' 261 #--------------------------------- 262 # information about how the parser will be used 263 #--------------------------------- 264 self.baseQuality = baseQuality 265 self.mappingQuality = mappingQuality 266 self.minLength = minLength 267 self.maxMisMatches = maxMisMatches 268 269 if useSuppAlignments: 270 self.ignoreSuppAlignments = 0 271 else: 272 self.ignoreSuppAlignments = 1 273 274 if useSuppAlignments: 275 self.ignoreSecondaryAlignments = 0 276 else: 277 self.ignoreSecondaryAlignments = 1 278 279 self.coverageType = coverageType 280 281 #--------------------------------- 282 # internal variables 283 #--------------------------------- 284 self.BFI = None # internal mapping results object 285 286 # these are set when we make the call to parse 287 self.bamFiles = [] 288 self.types = [] 289 self.doLinks = False 290 self.doCovs = False
291 292 #------------------------------------------------------------------------------ 293 # Bam parseratering 294
295 - def parseBams(self, 296 bamFiles, 297 doLinks=False, 298 doCovs=False, 299 types=None, 300 threads=1, 301 verbose=False):
302 '''Parse bam files to get coverage and linking reads. 303 304 Manages threading 305 Stores results in internal mapping results list 306 307 Inputs: 308 bamFiles - [string], full paths to BAMs 309 doLinks - == True -> find linking pairs 310 doCovs - == True -> calculate coverage profiles 311 types - [int] or None, number of insert types per bamfile 312 threads - int, max number of threads to use 313 verbose - == True -> be verbose 314 315 Outputs: 316 0 if the parsing worked, 1 otherwise 317 ''' 318 # set these now 319 self.bamFiles = bamFiles 320 321 # how may insert types for each bam file? 322 if types is None: 323 self.types = [1]*len(self.bamFiles) 324 else: 325 self.types = types 326 327 if len(self.types) != len(self.bamFiles): 328 raise InvalidNumberOfTypesException("%d types for %d BAM files" % \ 329 (len(self.types), 330 len(self.bamFiles))) 331 332 # make sure (again) that we're doing something 333 self.doLinks = doLinks 334 self.doCovs = doCovs 335 336 # check that the bam files and their indexes exist 337 for bam in bamFiles: 338 if not os.path.isfile(bam): 339 raise BAMFileNotFoundException("BAM file %s not found" % bam) 340 elif not os.path.isfile("%s.bai" % bam): 341 raise BAMIndexNotFoundException("Index file %s not found" % \ 342 ("%s.bai" % bam)) 343 344 # start running the parser in multithreaded mode 345 parse_queue = mp.Queue() 346 # each thread can place their new BFIs on a single global list 347 BFI_list = mp.Manager().list() 348 349 # place the bids on the queue 350 for bid in range(len(bamFiles)): 351 parse_queue.put(bid) 352 353 # place one None on the queue for each thread we have access to 354 for _ in range(threads): 355 parse_queue.put(None) 356 357 try: 358 # only the first thread and the first job should parse contig names 359 parse_proc = [mp.Process(target=externalParseWrapper, 360 args = (self, 361 parse_queue, 362 BFI_list, 363 verbose, 364 True) 365 ) 366 ] 367 # all the other threads will not parse contig names 368 parse_proc += [mp.Process(target=externalParseWrapper, 369 args = (self, 370 parse_queue, 371 BFI_list, 372 verbose, 373 True) 374 ) for _ in range(threads-1) 375 ] 376 377 for p in parse_proc: 378 p.start() 379 380 for p in parse_proc: 381 p.join() 382 383 # all processes are finished, collapse the BFI_list 384 self.collapseBFIs(BFI_list) 385 386 # success 387 return 0 388 389 except (KeyboardInterrupt, SystemExit): 390 # ctrl-c! Make sure all processes are terminated 391 for p in parse_proc: 392 p.terminate() 393 394 # dismal failure 395 return 1
396
397 - def _parseOneBam(self, bid):
398 '''Parse a single BAM file and append the result 399 to the internal mapping results list 400 401 Called from the ExternalParseWrapper 402 403 Inputs: 404 bid - unique identifier of the BAM to parse 405 406 Outputs: 407 A populated BM_FileInfo_C struct containing the parsing results 408 ''' 409 # destroy needs to be called on this 410 # -> it should be called by the calling function 411 BFI = BM_fileInfo_C() 412 pBFI = c.POINTER(BM_fileInfo_C) 413 pBFI = c.pointer(BFI) 414 415 BCT = BM_coverageType_C() 416 BCT.type = self.coverageType.cType 417 BCT.upperCut = float(self.coverageType.cUpper) 418 BCT.lowerCut = float(self.coverageType.cLower) 419 pBCT = c.POINTER(BM_coverageType_C) 420 pBCT = c.pointer(BCT) 421 422 bamfiles_c_array = (c.c_char_p * 1)() 423 bamfiles_c_array[:] = [self.bamFiles[bid]] 424 425 types_c_array = (c.c_int * 1)() 426 types_c_array[:] = [self.types[bid]] 427 428 CW = CWrapper() 429 if self.doLinks or self.doCovs: 430 CW._parseCoverageAndLinks(self.doLinks, 431 self.doCovs, 432 1, # numBams always one here 433 self.baseQuality, 434 self.mappingQuality, 435 self.minLength, 436 self.maxMisMatches, 437 types_c_array, 438 self.ignoreSuppAlignments, 439 self.ignoreSecondaryAlignments, 440 pBCT, 441 bamfiles_c_array, 442 pBFI) 443 else: 444 # types only 445 BCT.type = CT.NONE # just to be sure! 446 CW._parseCoverageAndLinks(self.doLinks, 447 self.doCovs, 448 1, # numBams always one here 449 0, 450 0, 451 0, 452 0, 453 types_c_array, 454 1, 455 1, 456 pBCT, 457 bamfiles_c_array, 458 pBFI) 459 460 return BFI
461
462 - def collapseBFIs(self, BFI_list):
463 '''Collapse multiple BFI objects into one and 464 set it as member variable BFI 465 466 Only one of the threads will bother to parse contig names. Find it's 467 BFI and then merge all the other BFIs (from other threads) into it 468 469 Inputs: 470 BFI_list - Manager.List, list of all BFIs created during parsing 471 472 Outputs: 473 None 474 ''' 475 baseBFI_index = 0 476 if self.doCovs or self.doLinks: 477 # all the BFIs are made. Only one has the contig IDs. find it. 478 for i in range(len(BFI_list)): 479 if len(BFI_list[i].contigNames) > 0: 480 baseBFI_index = i 481 break 482 483 # merge all the separate mapping results 484 self.BFI = BFI_list[baseBFI_index] 485 for i in range(len(BFI_list)): 486 if i != baseBFI_index: 487 self.BFI.consume(BFI_list[i])
488 489 #------------------------------------------------------------------------------ 490 # Printing and IO 491
492 - def printBamTypes(self, fileName=""):
493 '''Print template size and orientation information 494 to a file or to stdout. 495 496 Inputs: 497 fileName - string, full path to output file or "" 498 499 Outputs: 500 None 501 ''' 502 if self.BFI is None: 503 raise NoBAMSFoundException 504 else: 505 if fileName == "": 506 self.BFI.printBamTypes(sys.stdout) 507 else: 508 with open(fileName, "w") as fh: 509 self.BFI.printBamTypes(fh)
510
511 - def printCoverages(self, fileName=""):
512 '''Print coverage profile information to a file or to stdout 513 514 Inputs: 515 fileName - string, full path to output file or "" 516 517 Outputs: 518 None 519 ''' 520 if self.BFI is None: 521 raise NoBAMSFoundException 522 else: 523 if fileName == "": 524 self.BFI.printCoverages(sys.stdout) 525 else: 526 with open(fileName, "w") as fh: 527 self.BFI.printCoverages(fh)
528
546 547 ############################################################################### 548 ############################################################################### 549 ############################################################################### 550 ############################################################################### 551