Python Bio.SeqIO 模块,to_dict() 实例源码

我们从Python开源项目中,提取了以下17个代码示例,用于说明如何使用Bio.SeqIO.to_dict()

项目:mitre    作者:gerberlab    | 项目源码 | 文件源码
def fasta_to_dict(fasta_filename):
    # not clear the use of biopython gains anything here
    with open(fasta_filename) as f:
        return {k: str(v.seq) for k,v in 
                SeqIO.to_dict(SeqIO.parse(f,'fasta')).iteritems()}
项目:zika-pipeline    作者:zibraproject    | 项目源码 | 文件源码
def main():
        records = SeqIO.to_dict(SeqIO.parse(open(sys.argv[1]), 'fasta'))

    reader = csv.DictReader(sys.stdin, dialect="excel-tab")
    clusters = list(reader)

    groups = set([c['group'] for c in clusters])

    for group in groups:
        print "cluster%s\t%s-cluster%s" % (group, sys.argv[1], group)
        with open('%s-cluster%s' %(sys.argv[1], group), 'w') as fout:
            SeqIO.write([records[i['node']] for i in clusters if i['group'] == group], fout, 'fasta')
项目:amplicon_sequencing_pipeline    作者:thomasgurry    | 项目源码 | 文件源码
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
        '''
        seq_table: pandas.DataFrame
          Samples on the columns; sequences on the rows
        records: index of Bio.Seq
          Indexed, unaligned input sequences. This could come from BioPython's
          SeqIO.to_dict or SeqIO.index.
        max_dist: float
          genetic distance cutoff above which a sequence will not be merged into an OTU
        min_fold: float
          Multiply the sequence's abundance by this fold to get the minimum abundance
          of an OTU for merging
        threshold_pval: float
          P-value below which a sequence will not be merged into an OTU
        log: filehandle
          Log file reporting the abundance, genetic, and distribution checks.
        '''
        self.seq_table = seq_table
        self.records = records
        self.max_dist = max_dist
        self.min_fold = min_fold
        self.threshold_pval = threshold_pval
        self.log = log

        # get a list of the names of the sequences in order of their (decreasing) abundance
        self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

        # check that all sequence IDs in the table are in the fasta
        missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
        if len(missing_ids) > 0:
            raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

        # initialize OTU information
        self.membership = {}
        self.otus = []
项目:genepred    作者:egorbarsukoff    | 项目源码 | 文件源码
def extract_genes(output_folder, refs_folder, gff_folder, di):
    tp = 'gene'
    refs_files = sorted([join(refs_folder, f) for f in listdir(refs_folder) if isfile(join(refs_folder, f))])
    gff_files = sorted([join(gff_folder, f) for f in listdir(gff_folder) if isfile(join(gff_folder, f))])
    limit_info = dict( gff_type =['gene'])
    sequences = []
    for ind in range(len(refs_files)):
        f_fasta = refs_files[ind]
        f_gff = gff_files[ind]
        nm = f_gff.split("/")[-1][:-4]
        record = SeqIO.to_dict(SeqIO.parse(f_fasta, "fasta"))
        new_record = {}
        p = re.compile("N\w_\w+\.\d")
        for it in record:
            new_id = p.findall(it)[0]
            new_record[new_id] = record[it]

        record = new_record
        in_handle = open(f_gff)
        for rec in GFF.parse(in_handle, limit_info=limit_info, base_dict=record):
            for f in rec.features:
                if f.qualifiers['ID'][0] in di.get(f_gff, []):
                    subrecord = SeqRecord.SeqRecord(Seq.Seq(re.sub(r'[^ATGC]', 'A', str(f.extract(rec.seq)))),
                                       id=nm + "_" + rec.id + "_" + str(f.location),
                                       name=nm + "_" + str(f.location)+ "_" + tp,
                                       description= tp)
                    sequences.append(subrecord)
        in_handle.close()
    with open(output_folder+'sc_genes_seq.fasta', "w") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")
项目:genepred    作者:egorbarsukoff    | 项目源码 | 文件源码
def partial_genes_extract(contigs_path, outdir):
    contigs = SeqIO.to_dict(SeqIO.parse(contigs_path, "fasta"))
    partial_genes = []
    with open(outdir+'prodigal_output.gff') as f:
        for rec in GFF.parse(f, base_dict=contigs):
            for feature in rec.features:
                extract_seq = str(feature.extract(rec.seq))
                if feature.qualifiers['partial'][0] in ['11', '01', '10'] and len(extract_seq) > 110:
                    subrecord = SeqRecord.SeqRecord(Seq.Seq(extract_seq),
                                                    id='gene_'+str(len(partial_genes))+'_len_'+str(len(extract_seq)) +
                                                    '_conf_'+str(feature.qualifiers['conf'][0]),
                                                    description='patrial gene')
                    partial_genes.append(subrecord)
    return partial_genes
项目:genepred    作者:egorbarsukoff    | 项目源码 | 文件源码
def write_genes(genes_align, outdir):
    records = []
    orfs = SeqIO.to_dict(SeqIO.parse(outdir+'ORFs.fasta', 'fasta'))
    for gene_cluster in genes_align:
        for orf in genes_align[gene_cluster][1]:
            if orfs[orf].seq is not None:
                records.append(SeqRecord.SeqRecord(seq=orfs[orf].seq, id='{0}_{1}'.format(gene_cluster, orf),
                                                   name='{0}_{1}_conf={2}_len={3}'.format(gene_cluster,
                                                                                          orf,
                                                                                          genes_align[gene_cluster][0],
                                                                                          len(orfs[orf]))))
    SeqIO.write(records, outdir+'results.fasta', 'fasta')
项目:LoReAn    作者:lfaino    | 项目源码 | 文件源码
def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
    """
    this module looks for genes that were not used in the consensus stage. usually are gene models without long reads
    support
    """
    sys.stdout.write('\t###APPEND EVM NOT USED FROM CONTIGS BUILDING###\n')
    '''Adds the EVM records that are not present in the final contig evidence'''
    whole_fasta = open(whole_fasta_name, 'r')
    out_fasta_file = open(output_filename, 'r')
    outputMerged = open(output_merged_fasta_name, 'w')
    wholeDict = SeqIO.to_dict(SeqIO.parse(whole_fasta, 'fasta'))
    count = 0
    dictOut = {}
    outFasta = SeqIO.parse(out_fasta_file, 'fasta')
    for record in outFasta:
        if record.id in dictOut:
            dictOut[str(record.id) + '_' + str(count)] = str(record.seq)
            count += 1
        else:
            dictOut[record.id] = str(record.seq)
    for key in list(wholeDict.keys()):
        if 'evm' in key and key not in dictOut:
            ident = '>Gene' + str(count) + '_' + key
            outputMerged.write(
                ident + '\n' + str(wholeDict[key].seq) + '\n')
            count += 1
    for key, element in list(dictOut.items()):
        ident = '>Gene' + str(count) + '_' + key
        outputMerged.write(ident + '\n' + str(element) + '\n')
        count += 1

    whole_fasta.close()
    outFasta.close()
    outputMerged.close()
项目:LoReAn    作者:lfaino    | 项目源码 | 文件源码
def fasta2Dict(fasta_filename):
    """
    Prepare a dictionary of all the sequences that is used together with
    the fasta file to make single fasta files for the assembly
    """
    fasta_file = open(fasta_filename, 'r')
    fasta_dict2 = SeqIO.to_dict(SeqIO.parse(fasta_file, 'fasta'))
    fasta_dict = {}
    for key, seq2 in list(fasta_dict2.items()):
        seq = str(seq2.seq).replace("N", "")
        fasta_dict[key] = seq
        del fasta_dict2[key]
    fasta_file.close()
    return fasta_dict
项目:dsde-deep-learning    作者:broadinstitute    | 项目源码 | 文件源码
def dna_from_reference(chrom='9'):
    #reference_hg19 = '/dsde/data/deep/vqsr/Homo_sapiens_assembly19.fasta'
    reference_hg19 = '/Users/sam/vqsr_data/Homo_sapiens_assembly19.fasta'

    record_dict = SeqIO.to_dict(SeqIO.parse(reference_hg19, "fasta"))
    dna = str(record_dict[chrom].seq[10000000:75000000])
    return dna
项目:dsde-deep-learning    作者:broadinstitute    | 项目源码 | 文件源码
def dna_from_reference(chrom='21'):
    reference_hg19 = '/dsde/data/deep/vqsr/Homo_sapiens_assembly19.fasta'
    record_dict = SeqIO.to_dict(SeqIO.parse(reference_hg19, "fasta"))
    dna = str(record_dict[chrom].seq[35000000:37000000])
    return dna
项目:proteinER    作者:clauswilke    | 项目源码 | 文件源码
def main():
    '''
    Back translate aligned amino acid sequences to nucleotide sequences while maintaining the alignment
    '''
    #creating a parser
    parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description='Translate amino acid sequences into nucleotide sequences while maintaining the amino acid alignment and the order of sequences.')
    #adding arguments 
    parser.add_argument('-a', metavar='<aa_aln.fasta>', type=str, 
    help='input amino acid alignment file')
    parser.add_argument('-n', metavar='<nuc_aln.fasta>', type=str, 
    help='input unaligned/aligned nucleotide/codon sequence file')
    parser.add_argument('-o', metavar='<codon_aln.fasta>', type=str, 
    help='output codon alignment file') 
    args = parser.parse_args()

    #set up output file name if none is given
    if args.o is None:
        nuc_aln_file = "codon_aln.fasta"
    else:
        nuc_aln_file = args.o

    aa_aln_file = args.a
    nuc_seq_file = args.n

    aa_aln = AlignIO.read(aa_aln_file, "fasta") 

    nuc_seq = open(nuc_seq_file) #read in codon file
    codon_dict = SeqIO.to_dict(SeqIO.parse(nuc_seq, "fasta")) #read in codon sequence file as a dictionary, key=sequence ID, value=sequence itself

    back_translate(aa_aln,codon_dict,nuc_aln_file,gencode)
项目:wub    作者:nanoporetech    | 项目源码 | 文件源码
def read_seq_records_dict(input_object, format='fasta'):
    """Read SeqRecord objects to a dictionary from a file in the specified format.

    :param input_object: A file object or a file name.
    :param format: Input format (fasta by default).
    :returns: An iterator of SeqRecord objects.
    :rtype: dict

    """
    handle = input_object
    if type(handle) == str:
        handle = open(handle, "rU")
    return SeqIO.to_dict(SeqIO.parse(handle, format))
项目:picea-sitchensis-plastid    作者:sjackman    | 项目源码 | 文件源码
def main(gff_file, fasta_file):
    out_file = "%s.gb" % os.path.splitext(gff_file)[0]
    fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    gff_iter = GFF.parse(gff_file, fasta_input)
    SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
项目:LoReAn    作者:lfaino    | 项目源码 | 文件源码
def parse_contigs(outputAssembly, threshold_float, verbose):
    """
    Parses the output from iAssembler, to a single FASTA file
    """

    if verbose:
        sys.stderr.write('Executing: Parse assembled consensus\n')
    fname = outputAssembly + 'contig_member'
    fname_exists = os.path.isfile(fname)
    if fname_exists:
        # part all the files in the tmp assembly folder
        contigInfo = open(outputAssembly + 'contig_member', 'r')
        contigs = {}
        total_reads = 0
        for line in contigInfo:
            line = line.strip()
            line = line.split('\t')
            # count the reads for each assembled Unitig
            read_number = len(line) - 1
            for element in line:
                if 'evm' in element:
                    contigs[line[0]] = [read_number, element]
            if line[0] not in contigs:
                contigs[line[0]] = [read_number]
            # sum all the reads whitin one cluster
            total_reads += read_number
        contigInfo.close()
        # calculate the number of reads considering the threshold
        threshold = total_reads * float(threshold_float)
        global count_sequences
        real_contigs = {}
        count_unitigs = 1
        for key, element in list(contigs.items()):
            # to retrieve only supported assembly
            if len(element) == 2:
                if element[0] >= threshold and 'evm' in element[1]:
                    real_contigs[key] = element[1] + ' ' + str(
                        element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
                elif element[0] < threshold and 'evm' in element[1]:
                    real_contigs[key] = element[1] + ' ' + str(
                        element[0]) + '_below_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
            elif len(element) == 1:
                if element[0] >= threshold:
                    real_contigs[key] = 'Unitig' + str(count_sequences) + '_' + str(count_unitigs) + ' ' + str(
                        element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
                    count_unitigs += 1
        # writes the outputs
        fileAssembly = outputAssembly + 'unigene_seq.new.fasta'
        contigSeq = open(fileAssembly, 'r')
        contigDict = SeqIO.to_dict(SeqIO.parse(contigSeq, 'fasta'))
        output_filename = outputAssembly[:-1] + '_assembled.fasta'
        outputFile = open(output_filename, 'w')
        for iden, write_iden in list(real_contigs.items()):
            if iden in contigDict:
                outputFile.write('>' + write_iden + '\n' + str(contigDict[iden].seq) + '\n')
        contigSeq.close()
        outputFile.close()
    return
项目:dsde-deep-learning    作者:broadinstitute    | 项目源码 | 文件源码
def load_dna_and_chrom_label(args, only_labels=None):
    record_dict = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta"))
    bed_dict, bed_labels = bed_file_labels_to_dict(args.bed_file)

    train_data = np.zeros(( args.samples, args.window_size, len(args.inputs) ))
    train_labels = np.zeros(( args.samples, len(bed_labels) ))

    idx_offset = (args.window_size/2)

    amiguity_codes = {'K':[0,0,0.5,0.5], 'M':[0.5,0.5,0,0], 'R':[0.5,0,0,0.5], 'Y':[0,0.5,0.5,0], 'S':[0,0.5,0,0.5], 
                      'W':[0.5,0,0.5,0], 'B':[0,0.333,0.333,0.334], 'V':[0.333,0.333,0,0.334],'H':[0.333,0.333,0.334,0],
                      'D':[0.333,0,0.333,0.334], 'X':[0.25,0.25,0.25,0.25], 'N':[0.25,0.25,0.25,0.25]}

    count = 0
    while count < args.samples:
        contig_key, pos = sample_from_bed(bed_dict, contig_key_prefix='chr')
        contig = record_dict[contig_key]
        record = contig[pos-idx_offset: pos+idx_offset]

        cur_label_key = bed_file_label(bed_dict, contig_key, pos)

        if only_labels and not cur_label_key in only_labels:
            continue

        train_labels[count, args.labels[cur_label_key]] = 1

        for i,b in enumerate(record.seq):
            B=b.upper()
            if B in args.inputs.keys():
                train_data[count, i, args.inputs[B]] = 1.0
            elif B in amiguity_codes.keys():
                train_data[count, i, :4] = amiguity_codes[B]
            else:
                print('Error! Unknown code:', b)
                return

        count += 1

    print('Label:', bed_labels.keys(), 'label counts:', np.sum(train_labels, axis=0))
    print('Train data shape:', train_data.shape, ' Training labels shape:', train_labels.shape)

    return (train_data, train_labels)
项目:MIRA    作者:comprna    | 项目源码 | 文件源码
def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq):
    '''
    open bed file to write the kmer sequence
    ''' 
    with open(outfile_kmers_seq,'w') as of:
        pass

    wf = open(outfile_kmers_seq,'a+')

    # read names and postions from bed file
    records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta'))

    with open(kmer_bedfile) as rf:
        for line in rf:
            name,start,stop,gene,val,strand,rem = bedline_split(line)
            start = start
            long_seq_record = records[name]
            long_seq = long_seq_record.seq
            alphabet = long_seq.alphabet
            short_seq = str(long_seq)[start-1:stop]
            short_seq_record = SeqRecord(Seq(short_seq, alphabet))
            short_seq_record.seq.strip()

            '''
            If the sequence is in reverse strand, rev complement it
            '''
            if strand == "+":
                kmer_seq = short_seq_record.seq
            elif strand == "-":
                kmer_seq = short_seq_record.seq.reverse_complement() 

            else:
                print("Strand information not found for line:\n {}".format(line))
                #quit()

            wf.write("{}\t{}\t{}\t{}\t{}\t{}{}\n".format(name,start,stop,gene,kmer_seq,strand,rem))

    rf.close()
    wf.close()
    of.close()

    file_chk = check_files(outfile_kmers_seq)
    return(file_chk)
项目:MIRA    作者:comprna    | 项目源码 | 文件源码
def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq):
    '''
    open bed file to write the kmer sequence
    ''' 
    with open(outfile_kmers_seq,'w') as of:
        pass

    wf = open(outfile_kmers_seq,'a+')

    # read names and postions from bed file
    records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta'))

    with open(kmer_bedfile) as rf:
        for line in rf:
            name,start,stop,gene,val,strand = bedline_split(line)
            long_seq_record = records[name]
            long_seq = long_seq_record.seq
            alphabet = long_seq.alphabet
            short_seq = str(long_seq)[start-1:stop]
            short_seq_record = SeqRecord(Seq(short_seq, alphabet))
            short_seq_record.seq.strip()

            '''
            If the sequence is in reverse strand, rev complement it
            #edit 14-march-2017 (to extract kmers strictly to half open bed format)
            '''
            if strand == "+":
                kmer_seq = short_seq_record.seq
                #kmer created will be one base greater than the kmer size, trim
                kmer_seq = kmer_seq[:1]

            elif strand == "-":
                kmer_seq = short_seq_record.seq.reverse_complement() 
                kmer_seq = kmer_seq[:-1]
            else:
                print("Strand information not found for line:\n {}".format(line))
                quit()

            wf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(name,start,stop,gene,kmer_seq,strand))

    rf.close()
    wf.close()

    file_chk = check_files(outfile_kmers_seq)
    return(file_chk)