Python pysam module: Samfile() examples

The code examples below, collected from open-source Python projects, illustrate how to use pysam.Samfile().
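In current pysam, Samfile is a backward-compatibility alias for pysam.AlignmentFile, so the snippets below work with either name. A minimal sketch of the basic open/iterate/close cycle (example.bam is a placeholder path):

import pysam

samfile = pysam.Samfile("example.bam", "rb")   # "rb" opens binary BAM; use "r" for SAM
for read in samfile:                           # iterate over every alignment record
    print("%s %d" % (read.qname, read.pos))    # read name and 0-based leftmost position
samfile.close()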

Project: NGaDNAP    Author: theboocock    | project source | file source
def filter_bam(input_file, output_file, downweight_number, ctot, gtoa):
    """
        Takes a bam file as input and weights the quality of the reads down.

        Need to ensure we write the header out :)

        Investigate pysam and look for a header,
        this should really help us understand how to get this bam filter working 
        and writing the bam files directly back out to the terminal.
    """
    bam = pysam.Samfile(input_file,'rb')
    bam_out = pysam.Samfile(output_file, 'wb',template=bam) 
    for line in bam:
        change_bases_c = None
        change_bases_t = None
        seq = line.seq
        qual = line.qual
        if(ctot):
            change_bases_c = [check_c_2_t(nuc) and i < downweight_number for i, nuc in enumerate(seq)]
        if(gtoa):
            change_bases_t = [check_g_2_a(nuc) and (len(seq)-i) <= downweight_number for i, nuc in enumerate(seq)]
        new_qual = downweight_quality(qual,change_bases_c, change_bases_t)
        line.qual = new_qual
        bam_out.write(line)
    bam.close()
    bam_out.close()
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit    | project source | file source
def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome):
    """
    Make sure that this BAM is actually aligned to this fasta. Every sequence should be the same length. Sequences
    can exist in the reference that do not exist in the BAM, but not the other way around.
    """
    handle = pysam.Samfile(bam_path, 'rb')
    bam_sequences = set(zip(handle.references, handle.lengths))
    difference = bam_sequences - fasta_sequences
    if len(difference) > 0:
        base_err = 'Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}.'
        err = base_err.format(bam_path, genome, ','.join(['-'.join(map(str, x)) for x in difference]))
        raise UserException(err)
    missing_seqs = fasta_sequences - bam_sequences
    if len(missing_seqs) > 0:
        base_msg = 'BAM {} does not have the following sequence/length pairs in its header: {}.'
        msg = base_msg.format(bam_path, ','.join(['-'.join(map(str, x)) for x in missing_seqs]))
        logger.warning(msg)
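For reference, the fasta_sequences argument is a set of (name, length) pairs. A sketch of one way to build it with pysam.FastaFile (genome.fa and aligned.bam are placeholder paths; the fasta needs a .fai index, which pysam builds on first open if it is missing):

import pysam

fasta = pysam.FastaFile("genome.fa")
fasta_sequences = set(zip(fasta.references, fasta.lengths))
validate_bam_fasta_pairs("aligned.bam", fasta_sequences, "genome")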
Project: HiCembler    Author: lpryszcz    | project source | file source
def main(bam):

    # get contig2size
    #faidx = FastaIndex(fasta)
    #contig2size = {c: faidx.id2stats[c][0] for c in faidx}
    sam = pysam.Samfile(bam)
    contig2size = {c: s for c, s in zip(sam.references, sam.lengths)}

    # estimate on largest contigs
    longest_contigs = sorted(contig2size, key=lambda x: contig2size[x], reverse=1)
    totdata = [0, 0]
    for c in longest_contigs[:25]:
        data = bam2matches(bam, regions=[c], mapq=10)
        print c, contig2size[c], 1.*data[0] / sum(data), sum(data)
        for i in range(2):
            totdata[i] += data[i]
    data = totdata
    print 1.*data[0] / sum(data), sum(data)
Project: clrsvsim    Author: color    | project source | file source
def test_make_split_read_bam_file(self):
        sorted_bam = path.join(TEST_DATA_DIR, 'sorted.bam')
        with pysam.Samfile(sorted_bam, 'rb') as samfile:
            for read in samfile:
                if not read.cigarstring:
                    continue

                for breakpoint in (10, 50, 100):
                    if breakpoint >= read.rlen:
                        continue

                    for is_left_split in (True, False):
                        split_read = make_split_read(read, breakpoint, is_left_split)
                        cigar_items = list(Cigar(split_read.cigarstring).items())
                        clipped_item = cigar_items[0] if is_left_split else cigar_items[-1]
                        min_clip_len = breakpoint if is_left_split else read.rlen - breakpoint  # Can be longer if adjacent to another clip.
                        self.assertGreaterEqual(clipped_item[0], min_clip_len)
                        self.assertIn(clipped_item[1], ('S', 'H'))  # Will be soft-clipped unless already hard-clipped.
Project: workflow    Author: hmkim    | project source | file source
def chimeric_reads(bamfile, virus_contig, duplicates):
    igv_chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.igv.bam"
    if os.path.exists(igv_chimeric_file):
        return igv_chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
         pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
        for read in in_handle:
            try:
                chrom = in_handle.getrname(read.tid)
            except ValueError:
                continue
            if not is_chimeric_read(read, chrom, virus_contig):
                continue
            if read.qname in duplicates:
                continue
            out_handle.write(read)
    return igv_chimeric_file
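A usage sketch chaining this with the get_duplicates() helper shown further down the page (sample.bam and the virus contig name are placeholders):

duplicates = get_duplicates("sample.bam")
igv_bam = chimeric_reads("sample.bam", "chrVirus", duplicates)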
Project: cellranger    Author: 10XGenomics    | project source | file source
def merge_by_key(bam_filenames, key_func, bam_out):
    file_cache = tk_cache.FileHandleCache(mode='rb', open_func=pysam.Samfile)
    total_reads = 0
    heap  = []

    for bam_filename in bam_filenames:
        try:
            bam = file_cache.get(bam_filename)
            first_read = bam.next()
            heapq.heappush(heap, (key_func(first_read), first_read, bam_filename))
        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it to the bam.
        key, read, bam_filename = heapq.heappop(heap)
        bam = file_cache.get(bam_filename)
        bam_out.write(read)
        total_reads += 1

        # Get the next read from the source bam we just wrote from
        # If that BAM file is out of reads, then we leave that one out
        try:
            next_read = bam.next()
            heapq.heappush(heap, (key_func(next_read), next_read, bam_filename))
        except StopIteration:
            pass

    return total_reads
Project: cellranger    Author: 10XGenomics    | project source | file source
def create_bam_outfile(file_name, chrom_names, chrom_lengths, template=None, pgs=None, cos=None, rgs=None, replace_rg=False):
    """ Creates a bam file with given chromosome names and lengths.
    template is an existing bam file.  If it is specified, chrom_names and chrom_lengths
    are ignored. pg is dictionary specifying a 'PG' entry. ID field is required; PN/CL/PP/DS/VN fields are optional.
    rgs is a list of dicts specifiying an 'RG' entry. If replace_rg is True, the existing 'RG' entry is overwritten.
    """
    if template:
        header = template.header
        if pgs is not None:
            for pg in pgs:
                if not header.has_key('PG'):
                    header['PG'] = []
                # add in the PP field based on previous PG entry
                if len(header['PG']) > 0:
                    pp = header['PG'][-1]['ID']
                    if pp is not None:
                        pg['PP'] = pp
                header['PG'].append(pg)

        if cos is not None:
            for co in cos:
                if not header.has_key('CO'):
                    header['CO'] = []
                header['CO'].append(co)

        if rgs is not None:
            if replace_rg and header.has_key('RG') and len(rgs) > 0:
                header['RG'] = []
            for rg in rgs:
                if not header.has_key('RG'):
                    header['RG'] = []
                header['RG'].append(rg)

        bam_file = pysam.Samfile(file_name, 'wb', header=header)
        tids = {name:n for (n, name) in enumerate(template.references)}
    else:
        header = {'SQ': [{'SN': chrom_names[n], 'LN': chrom_lengths[n]} for n in xrange(len(chrom_names))]}
        bam_file = pysam.Samfile(file_name, 'wb', header=header)
        tids = {chrom_names[n]:n for n in xrange(len(chrom_names))}
    return bam_file, tids
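A usage sketch for create_bam_outfile that builds one alignment by hand; every name and coordinate here is made up, while pysam.AlignedSegment and pysam.qualitystring_to_array are standard pysam APIs:

bam_file, tids = create_bam_outfile("toy.bam", ["chr1"], [1000])

a = pysam.AlignedSegment()
a.query_name = "read1"
a.query_sequence = "ACGTACGTAC"
a.reference_id = tids["chr1"]
a.reference_start = 100
a.cigar = ((0, 10),)                                           # 10M
a.query_qualities = pysam.qualitystring_to_array("IIIIIIIIII")
bam_file.write(a)
bam_file.close()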
Project: cellranger    Author: 10XGenomics    | project source | file source
def create_bam_infile(file_name):
    bam_file = pysam.Samfile(file_name, 'rb')
    return bam_file
Project: cellranger    Author: 10XGenomics    | project source | file source
def create_sam_infile(file_name):
    sam_file = pysam.Samfile(file_name, 'r')
    return sam_file
Project: cellranger    Author: 10XGenomics    | project source | file source
def sort_by_name(file_name, sorted_prefix=None):
    """ Sorts a bam file by the read name, for paired-end
    """
    if sorted_prefix is None:
        sorted_prefix = file_name.replace('.bam', '') + '_namesorted'

    sorted_name = sorted_prefix + '.bam'
    # NOTE -- need to update our internal samtools in order to use pysam.sort
    #pysam.sort('-n', file_name, sorted_prefix)
    subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix])

    return pysam.Samfile(sorted_name, 'rb')
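Recent pysam releases expose the bundled samtools commands directly, so the subprocess call above can be avoided; a sketch using samtools-style arguments (assumes a reasonably modern pysam):

pysam.sort("-n", "-o", sorted_name, file_name)   # name-sort without shelling out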
Project: CLAM    Author: Xinglab    | project source | file source
def filter_multihits(filename, max_hits, verbose, tmp_dir):
    """
    Pre-processing function for cleaning up the input bam file.
    """
    if verbose:
        print_time_stamp('filtering multi-mapped reads up to %d hits.' % max_hits)
    multiread_set=set()
    subprocess.call("samtools view -h %s | awk '{if($2 !~ /_/ && $3 !~ /_/) {print}}' | samtools view -bS - > %s/filter_random.bam" % (filename, tmp_dir), shell=True)
    oldfile=pysam.Samfile(tmp_dir + '/filter_random.bam','rb')
    new_filename=os.path.abspath(tmp_dir + '/filter%d.bam' % max_hits)
    sorted_filename=os.path.abspath(tmp_dir + '/filter%d.sorted.bam' % max_hits)
    newfile=pysam.Samfile(new_filename, 'wb', template=oldfile)
    for aligned_read in oldfile:
        try:
            if aligned_read.opt("NH") < max_hits:
                newfile.write(aligned_read)
                if aligned_read.opt("NH")>1:
                    multiread_set.add(aligned_read.qname)
        except KeyError:
            newfile.write(aligned_read)
    oldfile.close()
    newfile.close()
    sort_cmd='samtools sort -T %s/ -o %s %s' % (tmp_dir, sorted_filename, new_filename)
    index_cmd='samtools index %s' % sorted_filename
    subprocess.call(sort_cmd, shell=True)
    subprocess.call(index_cmd, shell=True)
    subprocess.call('rm %s/filter_random.bam %s' % (tmp_dir, new_filename), shell=True)
    pickle.dump(multiread_set, open(tmp_dir + '/multiread_set.pdata', 'wb'), -1)
    return(sorted_filename, multiread_set)
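In current pysam, opt() is deprecated in favor of get_tag()/has_tag(). A sketch of the same NH-based test with those accessors (keep_read is a hypothetical helper; note the awk pipeline above also drops reads on underscore-named random/alt contigs, which this sketch does not reproduce):

def keep_read(aligned_read, max_hits):
    # Keep reads that lack an NH tag, mirroring the KeyError branch above.
    if not aligned_read.has_tag("NH"):
        return True
    return aligned_read.get_tag("NH") < max_hits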
Project: CLAM    Author: Xinglab    | project source | file source
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
    bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]}
    if unique_only:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
    else:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')

    # read in GTF
    gene_annot = read_gtf(gtf_fp)

    # iteratively call peaks in each gene
    peak_counter = 0
    for gene_name in tqdm(gene_annot):
        gene = gene_annot[gene_name]
        BED = call_gene_peak(bam_dict, gene, 
            unique_only=unique_only, with_control=with_control, 
            unstranded=unstranded)
        ofile.write(BED)
        #print BED
        peak_counter += len(BED.split('\n'))
    ofile.close()
    logger.info('called %i peaks'%peak_counter)
    return
Project: CLAM    Author: Xinglab    | project source | file source
def make_bam_handler_dict(ip_bam_list, con_bam_list):
    """DOCSTRING
    Args
    Returns
    """
    bam_dict = defaultdict(list)
    try:
        ubam_ip, mbam_ip = ip_bam_list
    except ValueError:
        ubam_ip = ip_bam_list[0]
        mbam_ip = None
    for bam_fn in ubam_ip.split(','):
        if not os.path.isfile(bam_fn):
            raise Exception('%s not found'%bam_fn)
        bam_dict['ubam.ip'].append( pysam.Samfile(bam_fn, 'rb') )
    if mbam_ip is not None:
        for bam_fn in mbam_ip.split(','):
            if not os.path.isfile(bam_fn):
                raise Exception('%s not found'%bam_fn)
            bam_dict['mbam.ip'].append( pysam.Samfile(bam_fn, 'rb') )

    if con_bam_list is None:
        return bam_dict
    try:
        ubam_con, mbam_con = con_bam_list
    except ValueError:
        ubam_con = con_bam_list[0]
        mbam_con = None
    for bam_fn in ubam_con.split(','):
        if not os.path.isfile(bam_fn):
            raise Exception('%s not found'%bam_fn)
        bam_dict['ubam.con'].append( pysam.Samfile(bam_fn, 'rb') )
    if mbam_con is not None:
        for bam_fn in mbam_con.split(','):
            if not os.path.isfile(bam_fn):
                raise Exception('%s not found'%bam_fn)
            bam_dict['mbam.con'].append( pysam.Samfile(bam_fn, 'rb') )
    return bam_dict
Project: RSeQC    Author: MonashBioinformaticsPlatform    | project source | file source
def main():
    usage="%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM format. BAM file should be sorted and indexed.")
    parser.add_option("-n","--subset-num",action="store",type="int",dest="subset_num",help="Number of small BAM files")
    parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. Output \"Prefix_num.bam\".")
    parser.add_option("-s","--skip-unmap",action="store_true",dest="skip_unmap", help="Skip unmapped reads.")
    (options,args)=parser.parse_args()

    if not (options.input_file and options.subset_num and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        print >>sys.stderr, '\n\n' + options.input_file + " does NOT exist" + '\n'
        sys.exit(0)     

    samfile = pysam.Samfile(options.input_file,'rb')

    sub_bam = {}
    count_bam={}
    for i in range(0,options.subset_num):
        sub_bam[i] = pysam.Samfile(options.output_prefix + '_' + str(i) +'.bam','wb',template=samfile)
        count_bam[i] = 0

    total_alignment = 0
    print >>sys.stderr, "Dividing " + options.input_file + " ...",
    try:
        while(1):
            aligned_read = samfile.next()
            if aligned_read.is_unmapped and options.skip_unmap is True:
                continue
            total_alignment += 1
            tmp = randrange(0,options.subset_num)
            sub_bam[tmp].write(aligned_read)
            count_bam[tmp] += 1

    except StopIteration:
        print >>sys.stderr, "Done"

    for i in range(0,options.subset_num):
        print "%-55s%d" %  (options.output_prefix + '_' + str(i) +'.bam', count_bam[i])
Project: RSeQC    Author: MonashBioinformaticsPlatform    | project source | file source
def main():
    usage="%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage,version="%prog " + __version__)
    parser.add_option("-i","--input",action="store",type="string",dest="input_file",help="Input BAM file") 
    parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in BED format. Must be strandard 12-column BED file. [required]")
    parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    parser.add_option("-n","--frag-num",action="store",type="int",dest="fragment_num",default=3,help="Minimum number of fragment. default=%default")

    (options,args)=parser.parse_args()

    if not (options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        print >>sys.stderr, "cannot find index file of input BAM file"
        print >>sys.stderr, options.input_file + '.bai' + " does not exist"
        sys.exit(0) 

    for file in (options.input_file, options.refgene_bed):
        if not os.path.exists(file):
            print >>sys.stderr, file + " does NOT exist" + '\n'
            sys.exit(0)

    print '\t'.join([str(i) for i in ("chrom","tx_start", "tx_end","symbol","frag_count","frag_mean","frag_median","frag_std")])
    for tmp in fragment_size(options.refgene_bed, pysam.Samfile(options.input_file), options.map_qual, options.fragment_num):
        print tmp
Project: RSeQC    Author: MonashBioinformaticsPlatform    | project source | file source
def __init__(self,inputFile):
        '''constructor. input could be bam or sam'''
        try:
            self.samfile = pysam.Samfile(inputFile, 'rb')
            if len(self.samfile.header) == 0:
                print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
                sys.exit(1)
            self.bam_format = True
        except Exception:
            self.samfile = pysam.Samfile(inputFile, 'r')
            if len(self.samfile.header) == 0:
                print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
                sys.exit(1)
            self.bam_format = False
Project: lapels    Author: shunping    | project source | file source
def fixmate(infile, outfile):
    inbam = pysam.Samfile(infile, 'rb')
    outbam = pysam.Samfile(outfile, 'wb', header=inbam.header, 
                           referencenames=inbam.references)
    qname = None    
    nTotal = 0
    nFixed = 0
    count = 0
    reads = []    
    gc.disable()    
    for rseq in inbam.fetch(until_eof=True):
        nTotal += 1        
        if qname is None or qname == rseq.qname:
            qname = rseq.qname
            reads.append(rseq)            
        else:            
            count = process(reads, inbam.getrname) 
            if count > 0:
                for r in reads:
                    outbam.write(r)
                nFixed += count                
            qname = rseq.qname
            del reads
            reads = [rseq]
        if nTotal % 200000 == 0:        
            logger.info('%d read(s) fixed' % nTotal)
            gc.enable()
            gc.disable()

    count = process(reads, inbam.getrname)
    if count > 0:
        for r in reads:
            outbam.write(r)
        nFixed += count
    logger.info('%d read(s) processed' % nTotal)
    logger.info('%d read(s) fixed and written to file' % nFixed)

    inbam.close()
    outbam.close()
    return (nTotal, nFixed)
Project: genepred    Author: egorbarsukoff    | project source | file source
def genes_alignments(outdir):
    genes_align = {}
    align_file = pysam.Samfile(outdir+'ORFs_on_genes_align.sam', 'rb')
    for rec in align_file:
        if rec.cigarstring is not None:
            gene_in_contig_len = int(re.findall(r'_len_(\d+)', rec.reference_name)[0])
            total_match = sum([i[1] for i in rec.cigar if i[0] == 0])
            conf = float(re.findall(r'_conf_([0-9]*\.[0-9]*)', rec.reference_name)[0])
            if gene_in_contig_len / total_match > 0.98:
                if genes_align.get(rec.reference_name) is None:
                    genes_align[rec.reference_name] = conf, set()
                genes_align[rec.reference_name][1].add(rec.qname)
    return genes_align
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit    | project source | file source
def bam_is_paired(bam_path, num_reads=20000, paired_cutoff=0.75):
    """
    Infers the paired-ness of a bam file.
    """
    sam = pysam.Samfile(bam_path)
    count = 0
    for rec in itertools.islice(sam, num_reads):
        if rec.is_paired:
            count += 1
    if tools.mathOps.format_ratio(count, num_reads) > paired_cutoff:
        return True
    elif tools.mathOps.format_ratio(count, num_reads) < 1 - paired_cutoff:
        return False
    else:
        raise UserException("Unable to infer pairing from bamfile {}".format(bam_path))
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit    | project source | file source
def is_bam(path):
    """Checks if a path is a BAMfile"""
    try:
        pysam.Samfile(path)
    except IOError:
        raise RuntimeError('Path {} does not exist'.format(path))
    except ValueError:
        return False
    return True
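The check works because pysam.Samfile raises ValueError when it cannot parse a file as SAM/BAM. A usage sketch (the path is a placeholder):

path = "aligned.bam"
if is_bam(path):
    handle = pysam.Samfile(path)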
Project: bamgineer    Author: pughlab    | project source | file source
def renamereads(inbamfn, outbamfn):

    inbam = pysam.Samfile(inbamfn, 'rb')
    outbam = pysam.Samfile(outbamfn, 'wb', template=inbam)

    paired = {}

    n = 0
    p = 0
    u = 0
    w = 0
    m = 0

    for read in inbam.fetch(until_eof=True):
        n += 1
        if read.is_paired:
            p += 1
            if read.qname in paired:
                uuid = paired[read.qname]
                del paired[read.qname]
                read.qname = uuid
                outbam.write(read)
                w += 1
                m += 1
            else:
                newname = str(uuid4())
                paired[read.qname] = newname
                read.qname = newname
                outbam.write(read)
                w += 1
        else:
            u += 1
            read.qname = str(uuid4())
            outbam.write(read)
            w += 1

    outbam.close()
    inbam.close()
Project: bamgineer    Author: pughlab    | project source | file source
def removeReadsOverlappingHetRegion(inbamfn, bedfn,outbamfn,path):
    print "___ removing reads overlapping heterozygous region ___"
    inbamsorted =  sub('.bam$','.sorted',inbamfn)
    pysam.sort(inbamfn, inbamsorted)
    pysam.index(inbamsorted+'.bam')

    alignmentfile = pysam.AlignmentFile(inbamsorted+'.bam', "rb" )
    outbam = pysam.Samfile(outbamfn, 'wb', template=alignmentfile )

    bedfile = open(bedfn, 'r')

    for bedline in bedfile:
        c = bedline.strip().split()

        if (len(c) == 3 ):
            chr2 = c[0]
            chr = c[0].strip("chr")
            start = int(c[1])
            end   = int(c[2])
        else :
            continue

        try:
            readmappings = alignmentfile.fetch(chr2, start, end)
        except ValueError:
            print("problem fetching reads in %s:%d-%d" % (chr2, start, end))
            continue

        for shortread in readmappings:
            try:
                outbam.write(shortread)
            except ValueError as e:
                print ("problem removing read :" + shortread.qname)
    outbamsorted =  sub('.bam$','.sorted',outbamfn)            
    pysam.sort(outbamfn, outbamsorted)
    bamDiff(inbamsorted+'.bam', outbamsorted +'.bam', path )
    outbam.close()
Project: tHapMix    Author: Illumina    | project source | file source
def pileup(bamfile_name):
    samfile = pysam.Samfile(bamfile_name, "rb")
    output_pileup = open("test.cov", "w")
    sum_cov = 0
    for pos_pileup in samfile.pileup():
        sum_cov += pos_pileup.n
        if not pos_pileup.pos % 1e3:
            av_cov = int(sum_cov / 1e3)
            sum_cov = 0
            print(pos_pileup.tid, pos_pileup.pos, av_cov, file=output_pileup)
    output_pileup.close()
    samfile.close()
Project: Topsorter    Author: hanfang    | project source | file source
def buildcov(self):
    """
    Builds barcode counts of the reads aligned to the genome.
    input : bam_file and bed_file
    output : writes a bed file with chr, start, end, info, barcode, no. of occurrences
    """
    before = datetime.now()
    barcode_pattern = re.compile(r'BX:(A-Z)?:[ACGT]*-[0-9]')
    self.__LOGGER.info("WARNING: dropping read alignments that are not barcoded")
    output_bed = open(self.barcode_bed, "wb")
    with open(self.bed_file) as regions:
        with pysam.Samfile(self.obam_file, "rb") as samfile:
            for line in regions:
                chr, start, end, info = line.rstrip('\n').rstrip('\r').split("\t")
                b_dict = {}
                try:
                    readsinregion = samfile.fetch(chr, int(start), int(end))
                    for read in readsinregion:
                        tags = dict(read.tags)
                        if 'BX' in tags:
                            if tags['BX'] in b_dict:
                                b_dict[tags['BX']] += 1
                            else:
                                b_dict[tags['BX']] = 1
                except:
                    self.__LOGGER.info("Skipping region : {0}:{1}:{2}".format(chr, start, end))
                    continue
                for k, v in b_dict.items():
                    output_bed.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chr, start, end, info, k, v))
    output_bed.close()
    after = datetime.now()
    self.__LOGGER.info('Computed barcode profiles for the vcf regions in {0}'.format(str(after - before)))
Project: brie    Author: huangyh09    | project source | file source
def load_samfile(sam_file):
    """To load an indexed bam file"""
    ftype = sam_file.split(".")[-1]
    if ftype != "bam" and ftype != "sam":
        print("Error: file type need suffix of bam or sam.")
        sys.exit(1)
    # print("Loading a %s" %ftype + " with file name: %s" %sam_file)
    if ftype == "bam":
        samfile = pysam.Samfile(sam_file, "rb")
    else:
        samfile = pysam.Samfile(sam_file, "r")
    return samfile
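A usage sketch; fetch() on the returned handle needs the BAM index (example.bam and the region are placeholders):

samfile = load_samfile("example.bam")          # expects example.bam.bai next to it
count = 0
for read in samfile.fetch("chr1", 0, 10000):   # random access requires the index
    count += 1
samfile.close()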
Project: PBSuite    Author: dbrowneup    | project source | file source
def run(self):
        try: 
            proc_name = self.name
            #Open the bams
            nBams = [] # nonTrim Bams
            tBams = [] # trim Bams
            for i in self.args.bam:
                nBams.append(pysam.Samfile(i))
            for i in self.args.pacBam:
                tBams.append(pysam.Samfile(i))

            while True:
                next_task = self.task_queue.get()
                if next_task is None:
                    # Poison pill means shutdown
                    logging.info('Thread %s: Exiting\n' % proc_name)
                    self.task_queue.task_done()
                    break
                try:
                    answer = next_task(nBams, tBams)
                except Exception as e:
                    logging.error("Exception raised in task %s" % (str(e)))
                    self.task_queue.task_done()
                    self.result_queue.put("Failure - UNK - %s" % str(e))
                    logging.info("fail in groupid=%s" % next_task.data.name)
                    continue
                self.task_queue.task_done()
                self.result_queue.put(answer)
            return
        except Exception as e:
            logging.error("Consumer %s Died\nERROR: %s" % (self.name, e))
            return
Project: PBSuite    Author: dbrowneup    | project source | file source
def mapTails(bam, args):
    if bam.endswith('.bam'):
        bam = pysam.Samfile(bam,'rb')
    elif bam.endswith('.sam'):
        bam = pysam.Samfile(bam)
    else:
        logging.error("Cannot open input file! %s" % (bam))
        exit(1)

    logging.info("Extracting tails")
    tailfastq = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False, dir=args.temp)
    tailfastq.close(); tailfastq = tailfastq.name
    r, t, m = extractTails(bam, outFq=tailfastq, minLength=args.minTail)

    logging.info("Parsed %d reads" % (r))
    logging.info("Found %d tails" % (t))
    logging.info("%d reads had double tails" % (m))
    if t == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    tailmap = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, dir=args.temp)
    tailmap.close(); tailmap = tailmap.name

    callBlasr(tailfastq, args.reference, args.params, args.nproc, tailmap)
    bam.close() #reset
    return tailmap
Project: PBSuite    Author: dbrowneup    | project source | file source
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5): #*args, **kwargs):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.bam = pysam.Samfile(bamName)
        if referenceName is not None:
            self.reference = pysam.Fastafile(referenceName)
        else:
            self.reference = None
        self.honH5 = honH5
        #self.args = args
        #self.kwargs = kwargs
Project: PBSuite    Author: dbrowneup    | project source | file source
def __call__(self, bam, reference, honH5):
        """
        Takes a pysam.Samfile
        """
        logging.info("Starting %s" % (self.name))
        size = self.end - self.start
        regName =  "%s:%d-%d" % (self.chrom, self.start, self.end)

        logging.debug("Making container for %s (%s %d bp)" % (self.groupName, regName, size))

        logging.debug("Parsing bam" )
        st = max(0, self.start)
        readCount = bam.count(self.chrom, st, self.end)
        reads = bam.fetch(self.chrom, st, self.end)
        if readCount == 0:
            logging.warning("No reads found in %s" % self.groupName)
            self.failed = True
            self.errMessage = "No reads found in %s" % self.groupName
            return
        else:
            logging.info("%d reads to parse in %s" % (readCount, self.groupName))

        myData = self.countErrors(reads, self.start, size, self.args)

        #request loc on honH5 and flush results
        with honH5.acquireH5('a') as h5dat:
            out = h5dat.create_group(self.groupName)

            out.attrs["reference"] = self.chrom
            out.attrs["start"] = self.start
            out.attrs["end"] = self.end

            if size < CHUNKSHAPE[1]:
                chunk = (CHUNKSHAPE[0], size-1)
            else:
                chunk = CHUNKSHAPE

            container = out.create_dataset("data", data = myData, \
                                    chunks=chunk, compression="gzip")
            h5dat.flush()
Project: OpEx    Author: RahmanTeam    | project source | file source
def __init__(self, threadidx, options, config, startline, endline, names):
        multiprocessing.Process.__init__(self)

        # Initializing variables
        self.threadidx = threadidx
        self.options = options
        self.config = config
        self.startline = startline
        self.endline = endline
        self.names = names

        # Initializing reads dictionary
        self.reads = dict()

        # Connecting to BAM file
        self.samfile = pysam.Samfile(options.input, "rb")

        # Connecting to transcript database file
        if not config['transcript_db'] is None:
            self.enstdb = pysam.Tabixfile(config['transcript_db'])
        else:
            self.enstdb = None

        # Initializing output files
        if int(options.threads) > 1:
            if config['outputs']['regions']:
                self.out_targets = open(options.output + '_regions_tmp_' + str(threadidx) + '.txt', 'w')
            if config['outputs']['profiles']:
                self.out_profiles = open(options.output + '_profiles_tmp_' + str(threadidx) + '.txt', 'w')
                if not config['transcript_db'] is None:
                    self.out_poor = open(options.output + '_poor_tmp_' + str(threadidx) + '.txt', 'w')
        else:
            if config['outputs']['regions']:
                self.out_targets = open(options.output + '_regions.txt', 'w')
            if config['outputs']['profiles']:
                self.out_profiles = open(options.output + '_profiles.txt', 'w')
                if not config['transcript_db'] is None:
                    self.out_poor = open(options.output + '_poor.txt', 'w')

    # Checking if target is fail or pass
Project: workflow    Author: hmkim    | project source | file source
def igv_reads(bamfile, virus_contig):
    igv_chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.igv.bam"
    if os.path.exists(igv_chimeric_file):
        return igv_chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
         pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(virus_contig)
        for read in in_handle:
            if skip_read(read, contig, virus_contig, in_handle, False):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return igv_chimeric_file
Project: workflow    Author: hmkim    | project source | file source
def keep_chimeric_reads(bamfile, virus_contig):
    chimeric_file = os.path.splitext(bamfile)[0] + ".chimeric.bam"
    if os.path.exists(chimeric_file):
        return chimeric_file
    with pysam.Samfile(bamfile, "rb") as in_handle, \
         pysam.Samfile(chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(virus_contig)
        for read in in_handle:
            if skip_read(read, contig, virus_contig, in_handle, True):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return chimeric_file
Project: workflow    Author: hmkim    | project source | file source
def get_duplicates(bam_file):
    s = set()
    with pysam.Samfile(bam_file, "rb") as in_handle:
        for read in in_handle:
            if read.is_duplicate:
                s.update([read.qname])
    return s
Project: cellranger    Author: 10XGenomics    | project source | file source
def concatenate_and_fix_bams(out_bamfile, bamfiles, drop_tags=None):
    """Concatenate bams with different references and populate tags.

    Assumes that the input bams have completely non-overlapping reference entries.
    No sorting assumed. Output reads are in the same order as the input.
    Tags are stripped from the header (which is assumed to look like an AugmentedFastqHeader)
    and are stored as bam tags. Tags in drop_tags are completely dropped.

    Args:
    - drop_tags: Set of tag names to ignore. If None, then all tags will be included.
    """
    template_bam = pysam.Samfile(bamfiles[0])
    header = template_bam.header

    for bam_fn in bamfiles[1:]:
        bam = pysam.Samfile(bam_fn)
        header['SQ'].extend(bam.header['SQ'])

    refname_to_tid = { ref_head['SN']:idx for (idx, ref_head) in enumerate(header['SQ']) }

    bam_out = pysam.Samfile(out_bamfile, "wb", header=header)

    for bam_fn in bamfiles:
        bam = pysam.Samfile(bam_fn)

        for rec in bam:
            if rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = -1
                rec.next_reference_id = -1
            elif rec.is_unmapped and not rec.mate_is_unmapped:
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]
                rec.reference_id = rec.next_reference_id
            elif not rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = rec.reference_id
            else:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]

            # Pull fields out of qname, and make tags, writing out to bam_out
            qname_split = rec.qname.split(cr_fastq.AugmentedFastqHeader.TAG_SEP)
            rec.qname = qname_split[0]

            tags = rec.tags
            for i in range(1, len(qname_split), 2):
                tag = (qname_split[i], qname_split[i+1])
                # Don't add empty tags
                if len(tag[1]) > 0 and (drop_tags is None or not tag[0] in drop_tags):
                    tags.append(tag)

            rec.tags = tags
            bam_out.write(rec)
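A usage sketch (the paths and the dropped tag name are placeholders; as the docstring requires, the inputs must have non-overlapping reference entries):

concatenate_and_fix_bams("merged.bam", ["genome1.bam", "genome2.bam"], drop_tags={"XS"})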
Project: CLAM    Author: Xinglab    | project source | file source
def realigner(out_dir, tmp_dir, winsize=50, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'),'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
    obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam)
    chr_list=[x['SN'] for x in ubam.header['SQ']]

    # construct the mread_dict; this will be needed throughout
    mread_dict = defaultdict(list)
    for alignment in mbam:
        mread_dict[alignment.qname].append(alignment)

    # keep a record of processed reads
    processed_mreads = set()

    # iterate through all mreads
    for read_qname in mread_dict:
        if read_qname in processed_mreads:
            continue

        ## construct the fully-connected subgraph for each read
        read_to_locations, processed_mreads = \
            construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=winsize, unstranded=unstranded)
        subgraph = set()
        for read in read_to_locations:
            _ = map(subgraph.add, read_to_locations[read].keys())
        subgraph = list(subgraph)

        ## build the BIT tracks
        node_track, multi_reads_weights = \
            construct_BIT_track(subgraph, read_to_locations, ubam, unstranded)

        ## run EM
        multi_reads_weights = \
            run_EM(node_track, multi_reads_weights, w=winsize)

        ## write to obam
        for read in multi_reads_weights:
            for node in multi_reads_weights[read]:
                alignment = read_to_locations[read][node]
                score = round(multi_reads_weights[read][node][0], 3)
                alignment.set_tag('AS', score)
                alignment.set_tag('PG', 'CLAM')
                obam.write(alignment)
    # sort the final output
    logger.info('sorting output')
    obam.close()
    ubam.close()
    mbam.close()
    obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam')
    pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam'))
    pysam.index(obam_sorted_fn)
    os.remove(os.path.join(out_dir, 'realigned.bam'))
    return
Project: CLAM    Author: Xinglab    | project source | file source
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'),'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
    bam_dict = {'ubam.ip':[ubam], 'mbam.ip':[mbam]}
    if unique_only:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
    else:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')

    # read in GTF
    logger.info('read gtf from "%s"' % gtf_fp)
    gene_annot = read_gtf(gtf_fp)

    # fetch tag counts in each gene
    ##  gene_name: { 'ip': bin_interval_ip, 'con': bin_interval_con }
    logger.info('reading gene counts')
    gene_count = defaultdict(dict)
    for gene_name in tqdm(gene_annot):
        gene = gene_annot[gene_name]
        bin_interval_ip, bin_interval_con = \
            gene_to_count(bam_dict, gene, 
                unique_only=unique_only, with_control=with_control, 
                unstranded=unstranded)
        if bin_interval_ip is None:  ## if no IP reads in this gene
            continue
        gene_count[gene_name]['ip'] = bin_interval_ip
        gene_count[gene_name]['con'] = bin_interval_con

    # estimate global overdispersion param. for each dataset
    logger.info('estimating dispersion parameter')
    alpha_ip_vec = estim_dispersion_param(len(bam_dict['ubam.ip']), gene_count, 'ip' )
    if with_control:
        alpha_con_vec = estim_dispersion_param(len(bam_dict['ubam.con']), gene_count, 'con' )
    else:
        alpha_con_vec = np.asarray(alpha_ip_vec)

    # perform statistical test
    logger.info('calling peaks')
    peak_counter = 0
    for gene_name in gene_count:
        gene = gene_annot[gene_name]
        BED = test_gene_bin(gene, gene_count[gene_name]['ip'], gene_count[gene_name]['con'],
            alpha_ip_vec, alpha_con_vec)
        ofile.write(BED)
        peak_counter += len(BED.split('\n'))
    ofile.close()
    logger.info('called %i peaks'%peak_counter)

    return
Project: RSeQC    Author: MonashBioinformaticsPlatform    | project source | file source
def genebody_coverage(bam, position_list):
    '''
    position_list is dict returned from genebody_percentile
    position is 1-based genome coordinate
    '''
    samfile = pysam.Samfile(bam, "rb")
    aggregated_cvg = collections.defaultdict(int)

    gene_finished = 0
    for chrom, strand, positions in position_list.values():
        coverage = {}
        for i in positions:
            coverage[i] = 0.0
        chrom_start = positions[0]-1
        if chrom_start <0: chrom_start=0
        chrom_end = positions[-1]
        try:
            samfile.pileup(chrom, 1,2)
        except:
            continue

        for pileupcolumn in samfile.pileup(chrom, chrom_start, chrom_end, truncate=True):
            ref_pos = pileupcolumn.pos+1
            if ref_pos not in positions:
                continue
            if pileupcolumn.n == 0:
                coverage[ref_pos] = 0
                continue                
            cover_read = 0
            for pileupread in pileupcolumn.pileups:
                if pileupread.is_del: continue
                if pileupread.alignment.is_qcfail:continue 
                if pileupread.alignment.is_secondary:continue 
                if pileupread.alignment.is_unmapped:continue
                if pileupread.alignment.is_duplicate:continue
                cover_read +=1
            coverage[ref_pos] = cover_read
        tmp = [coverage[k] for k in sorted(coverage)]
        if strand == '-':
            tmp = tmp[::-1]
        for i in range(0,len(tmp)):
            aggregated_cvg[i] += tmp[i]
        gene_finished += 1

        if gene_finished % 100 == 0:
            print >>sys.stderr, "\t%d transcripts finished\r" % (gene_finished),
    return aggregated_cvg
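For plain per-base depth over a region, newer pysam also provides count_coverage(), which returns four arrays (A, C, G, T counts) that can be summed per position; a minimal sketch with placeholder path and region (its default read filtering differs slightly from the manual pileup loop above):

import pysam

samfile = pysam.Samfile("example.bam", "rb")          # must be indexed
counts = samfile.count_coverage("chr1", 1000, 1100)   # 4 arrays, one per base
depth = [sum(base) for base in zip(*counts)]          # total depth per position
samfile.close()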
Project: mirtop    Author: miRTop    | project source | file source
def read_bam(bam_fn, precursors, clean = True):
    """
    read bam file and perform realignment of hits
    """
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    for line in handle:
        if line.reference_id < 0:
            logger.debug("Sequence not mapped: %s" % line.reference_id)
            continue
        query_name = line.query_name
        if query_name not in reads and line.query_sequence is None:
            continue
        if line.query_sequence and line.query_sequence.find("N") > -1:
            continue
        if query_name not in reads:
            reads[query_name].set_sequence(line.query_sequence)
            reads[query_name].counts = _get_freq(query_name)
        if line.is_reverse:
            logger.debug("Sequence is reverse: %s" % line.query_name)
            continue
        chrom = handle.getrname(line.reference_id)
        #  print "%s %s %s %s" % (line.query_name, line.reference_start, line.query_sequence, chrom)
        cigar = line.cigartuples
        iso = isomir()
        iso.align = line
        iso.set_pos(line.reference_start, len(reads[query_name].sequence))
        logger.debug("READ::From BAM start %s end %s" % (iso.start, iso.end))
        if len(precursors[chrom]) < line.reference_start + len(reads[query_name].sequence):
            continue
        iso.subs, iso.add, iso.cigar = filter.tune(reads[query_name].sequence, precursors[chrom], line.reference_start, cigar)
        logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
        if len(iso.subs) < 2:
            reads[query_name].set_precursor(chrom, iso)
    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
Project: NGaDNAP    Author: theboocock    | project source | file source
def main(argv):
    args = parse_args(argv)
    print args
    if args.input == "-" and sys.stdin.isatty():
        sys.stderr.write("STDIN is a terminal, terminating!\n")
        return 1
    elif sys.stdout.isatty():
        sys.stderr.write("STDOUT is a terminal, terminating!\n")
        return 1

    with pysam.Samfile(args.input, "rb") as infile:
        with pysam.Samfile("-", "wb", template=infile) as outfile:
            curpos = None
            curvariants = {}
            for (read_num, read) in enumerate(infile):
                if curpos and ((read.tid, read.pos) != curpos):
                    # Sort order is defined as ascending 'tid's and positions
                    if curpos > (read.tid, read.pos) and not read.is_unmapped:
                        sys.stderr.write("ERROR: Input file does not appear "
                                         "to be sorted by coordinates at "
                                         "record %i, aborting ...\n"
                                         % (read_num,))
                        return 1

                    _flush_buffer(outfile, curvariants,
                                  args.remove_duplicates)
                    curpos = None

                is_filtered = read.flag & _FILTERED_FLAGS
                is_collapsed = read.qname.startswith("M_")
                if is_filtered or not (read.qual and is_collapsed):
                    outfile.write(read)
                    continue

                curpos = (read.tid, read.pos)
                nkey = (read.is_reverse, read.pos, read.alen)
                if nkey in curvariants:
                    curvariants[nkey][0].append(read)
                    curvariants[nkey][1] += 1
                else:
                    curvariants[nkey] = [[read], 1]

            _flush_buffer(outfile, curvariants, args.remove_duplicates)

    return 0
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit    | project source | file source
def setup_hints(job, input_file_ids):
    """
    Generates hints for a given genome with a list of BAMs. Will add annotation if it exists.
    """
    # RNA-seq hints
    filtered_bam_file_ids = {'BAM': collections.defaultdict(list), 'INTRONBAM': collections.defaultdict(list)}
    for dtype, bam_dict in input_file_ids['bams'].iteritems():
        if len(bam_dict) == 0:
            continue
        # Since BAMs are valid, we can assume that they all share the same header
        bam_file_id, bai_file_id = bam_dict.values()[0]
        bam_path = job.fileStore.readGlobalFile(bam_file_id)
        sam_handle = pysam.Samfile(bam_path)
        # triple disk usage to deal with name sorted bam
        disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id]) * 3
        # generate reference grouping that will be used downstream until final cat step
        grouped_references = [tuple(x) for x in group_references(sam_handle)]
        for original_path, (bam_file_id, bai_file_id) in bam_dict.iteritems():
            for reference_subset in grouped_references:
                j = job.addChildJobFn(namesort_bam, bam_file_id, bai_file_id, reference_subset, disk_usage,
                                      disk=disk_usage, cores=4, memory='16G')
                filtered_bam_file_ids[dtype][reference_subset].append(j.rv())

    # IsoSeq hints
    iso_seq_hints_file_ids = []
    iso_seq_file_ids = input_file_ids['iso_seq_bams']
    if len(iso_seq_file_ids) > 0:
        for bam_file_id, bai_file_id in iso_seq_file_ids:
            disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id])
            j = job.addChildJobFn(generate_iso_seq_hints, bam_file_id, bai_file_id, disk=disk_usage)
            iso_seq_hints_file_ids.append(j.rv())

    # protein hints
    if input_file_ids['protein_fasta'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['protein_fasta'])
        j = job.addChildJobFn(generate_protein_hints, input_file_ids['protein_fasta'], input_file_ids['genome_fasta'],
                              disk=disk_usage)
        protein_hints_file_id = j.rv()
    else:
        protein_hints_file_id = None

    # annotation hints
    if input_file_ids['annotation'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['annotation'])
        j = job.addChildJobFn(generate_annotation_hints, input_file_ids['annotation'], disk=disk_usage)
        annotation_hints_file_id = j.rv()
    else:
        annotation_hints_file_id = None
    return job.addFollowOnJobFn(merge_bams, filtered_bam_file_ids, annotation_hints_file_id,
                                iso_seq_hints_file_ids, protein_hints_file_id).rv()
Project: Comparative-Annotation-Toolkit    Author: ComparativeGenomicsToolkit    | project source | file source
def namesort_bam(job, bam_file_id, bai_file_id, reference_subset, disk_usage, num_reads=50 ** 6):
    """
    Slices out the reference subset from a BAM, name sorts that subset, then chunks the resulting reads up for
    processing by filterBam.
    """
    def write_bam(r, ns_handle):
        """Write to the path, returns file ID"""
        outf = tools.fileOps.get_tmp_toil_file()
        outf_h = pysam.Samfile(outf, 'wb', template=ns_handle)
        for rec in r:
            outf_h.write(rec)
        outf_h.close()
        return job.fileStore.writeGlobalFile(outf)

    bam_path = job.fileStore.readGlobalFile(bam_file_id)
    is_paired = bam_is_paired(bam_path)
    job.fileStore.readGlobalFile(bai_file_id, bam_path + '.bai')
    name_sorted = tools.fileOps.get_tmp_toil_file(suffix='name_sorted.bam')
    cmd = [['samtools', 'view', '-b', bam_path] + list(reference_subset),
           ['sambamba', 'sort', '-t', '4', '-m', '15G', '-o', '/dev/stdout', '-n', '/dev/stdin']]
    tools.procOps.run_proc(cmd, stdout=name_sorted)
    ns_handle = pysam.Samfile(name_sorted)
    # this group may come up empty -- check to see if we have at least one mapped read
    try:
        _ = ns_handle.next()
    except StopIteration:
        return None
    # reset file handle to start
    ns_handle = pysam.Samfile(name_sorted)
    filtered_file_ids = []
    r = []
    for qname, reads in itertools.groupby(ns_handle, lambda x: x.qname):
        r.extend(list(reads))
        if len(r) >= num_reads:
            file_id = write_bam(r, ns_handle)
            j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
            filtered_file_ids.append(j.rv())
            r = []
    # do the last bin, if its non-empty
    if len(r) > 0:
        file_id = write_bam(r, ns_handle)
        j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
        filtered_file_ids.append(j.rv())
    return job.addFollowOnJobFn(merge_filtered_bams, filtered_file_ids, disk=disk_usage, memory='16G').rv()
Project: PBSuite    Author: dbrowneup    | project source | file source
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)#keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))

    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")

    #do what you will.. from here
    #spot = SpotResult(chrom='11', start=2215290, end=2215798, svtype="DEL", size=208)
    #spot = SpotResult(chrom='22', start=45964261, end=45965596, svtype="DEL", size=-1)
    # This is what I need to start with
    #spot = SpotResult(chrom="22", start=45963975, end=45964532, svtype="DEL", size=57)
    fh = open("honeymissing.bed")
    for line in fh.readlines():
        data = line.strip().split('\t')
        spot = SpotResult(chrom=data[0], start=int(data[1]), end = int(data[2]), \
                          size=int(data[3].split('=')[-1]), svtype="DEL")

        j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
        if j.supportingReadsFilter(spot, bam, args):
            consen = ConsensusCaller(spot, args)
            consen(bam, reference, 'none')
            for i in consen.newSpots:
                i.tags["seqmade"] = True
                print i
            if len(consen.newSpots) == 0:
                spot.tags["noseq"] = True
                print str(spot)
        else:
            spot.tags["filtfail"] = True
            print str(spot)

    #done with test code
    logging.info("Finished testing")