Python sklearn.model_selection module: GroupKFold() code examples

The following 5 code examples, extracted from open-source Python projects, illustrate how to use sklearn.model_selection.GroupKFold().

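For reference, GroupKFold guarantees that all samples sharing a group label end up on the same side of every split. A minimal, self-contained sketch (the toy data below is made up for illustration):

import numpy as np
from sklearn.model_selection import GroupKFold

# Toy data: 6 samples drawn from 3 groups (e.g. 3 source documents).
X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
groups = np.array(['a', 'a', 'b', 'b', 'c', 'c'])

for train_idx, test_idx in GroupKFold(n_splits=3).split(X, y, groups):
    # No group ever straddles the train/test boundary.
    assert set(groups[train_idx]).isdisjoint(set(groups[test_idx]))
    print(groups[train_idx], '->', groups[test_idx])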
Project: scienceie17 | Author: OC-ScienceIE
import pickle

from sklearn.model_selection import GroupKFold

# read_labels is a helper from the scienceie17 project.
def generate_folds(labels_fname, folds_fname, max_n_folds=10):
    """
    Generate folds for CV exps with n = 2, ..., max_n_folds.
    Save as pickled dict with n as key.
    """
    filenames = read_labels(labels_fname)['__filenames__']
    folds = {}

    for n in range(2, max_n_folds + 1):
        # Create folds from complete texts only
        # (i.e. instances/sentences of the same text are never in different folds).
        # There is no random seed, because the partitioning algorithm is deterministic.
        group_k_fold = GroupKFold(n_splits=n)
        # Don't bother to pass real X and Y, because they are not really used.
        folds[n] = list(group_k_fold.split(filenames, filenames, filenames))

    print('writing folds to ' + folds_fname)
    with open(folds_fname, 'wb') as f:
        pickle.dump(folds, f)
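Once written, the pickled dict can be reloaded and indexed by the desired fold count. A hypothetical consumer ('folds.pkl' stands in for folds_fname):

import pickle

with open('folds.pkl', 'rb') as f:
    folds = pickle.load(f)

for train_idx, test_idx in folds[5]:  # iterate over the 5-fold partition
    ...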
项目:scienceie17    作者:OC-ScienceIE    | 项目源码 | 文件源码
import pickle

import numpy as np
from sklearn.model_selection import GroupKFold

# collect_crf_data is a helper from the scienceie17 project.
def get_train_test_fold_filenames(true_iob_dir, use_pickle=True):
    pickle_fname = '_train_test_fold_fnames.pkl'

    if use_pickle:
        try:
            with open(pickle_fname, 'rb') as f:
                return pickle.load(f)
        except IOError:
            pass

    # Misuse the data-collection function to get X, y and filenames.
    # Since we are not interested in the actual features, we pretend true_iob_dir is a feature dir.
    data = collect_crf_data(true_iob_dir, true_iob_dir)

    # Now create the group-wise folds.
    group_k_fold = GroupKFold(n_splits=5)

    # Create folds from complete texts only (i.e. instances of the same text are never in different folds)
    # Use same split for all three entities.
    # Note that there is no random seed, because the output of group_k_fold.split is deterministic
    # as long as the iob files are globbed in exactly the same order
    splits = group_k_fold.split(data['feats'], data['Material'], data['filenames'])

    fnames = np.array(data['filenames'])
    train_test_fold_fnames = []

    for train_idx, test_idx in splits:
        train_fnames = np.unique(fnames[train_idx])
        test_fnames = np.unique(fnames[test_idx])

        train_test_fold_fnames.append((train_fnames, test_fnames))

    with open(pickle_fname, 'wb') as f:
        pickle.dump(train_test_fold_fnames, f)

    return train_test_fold_fnames
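Both snippets above rely on the fact that GroupKFold looks only at the length of X and at groups; y is accepted purely for API compatibility, which is why passing filenames (or anything of the right length) for X and y is harmless. A quick sanity check of that assumption:

import numpy as np
from sklearn.model_selection import GroupKFold

groups = np.array(['doc1', 'doc1', 'doc2', 'doc2', 'doc3', 'doc3'])

# Pass the groups array as X and y too, exactly as the snippets above do...
splits_a = list(GroupKFold(n_splits=3).split(groups, groups, groups))
# ...and pass dummy X with no y at all: the splits come out identical.
splits_b = list(GroupKFold(n_splits=3).split(np.zeros(len(groups)), None, groups))

for (tr_a, te_a), (tr_b, te_b) in zip(splits_a, splits_b):
    assert np.array_equal(tr_a, tr_b) and np.array_equal(te_a, te_b)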
Project: hh-page-classifier | Author: TeamHG-Memex
from sklearn.model_selection import GroupKFold, KFold

# get_domain, AdviceItem, WARNING, WARN_N_RELEVANT_DOMAINS and two_class_folds
# are helpers from the hh-page-classifier project.
def build_folds(all_xs, all_ys, advice):
    domains = [get_domain(doc['url']) for doc in all_xs]
    n_domains = len(set(domains))
    n_relevant_domains = len(
        {domain for domain, is_relevant in zip(domains, all_ys) if is_relevant})
    n_folds = 4
    if n_relevant_domains == 1:
        advice.append(AdviceItem(
            WARNING,
            'Only 1 relevant domain in the data makes it impossible to do '
            'cross-validation across domains, '
            'and will likely result in model over-fitting.'
        ))
        folds = KFold(n_splits=n_folds).split(all_xs)
    else:
        folds = (GroupKFold(n_splits=min(n_domains, n_folds))
                 .split(all_xs, groups=domains))

    if 1 < n_relevant_domains < WARN_N_RELEVANT_DOMAINS:
        advice.append(AdviceItem(
            WARNING,
            'Low number of relevant domains (just {}) '
            'might result in model over-fitting.'.format(n_relevant_domains)
        ))
    folds = two_class_folds(folds, all_ys)
    if not folds:
        folds = two_class_folds(KFold(n_splits=n_folds).split(all_xs), all_ys)
    if not folds:
        advice.append(AdviceItem(
            WARNING,
            'Cannot do cross-validation, as there are no folds where '
            'training data has both relevant and non-relevant examples. '
            'There are too few domains or the dataset is too unbalanced.'
        ))
    return folds
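two_class_folds is a project helper; judging from the warning above, it keeps only those folds whose training portion contains both relevant and non-relevant examples. A plausible sketch under that assumption, not the project's actual implementation:

def two_class_folds(folds, all_ys):
    # Hypothetical reconstruction: drop folds whose training part
    # contains only one class.
    usable = []
    for train_idx, test_idx in folds:
        if len({all_ys[i] for i in train_idx}) > 1:
            usable.append((train_idx, test_idx))
    return usable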
Project: AutoSleepScorerDev | Author: skjerns
import time

import numpy as np
from sklearn.model_selection import GroupKFold

# models, tools, generator, get_activations and Checkpoint_balanced are
# helpers from the AutoSleepScorerDev project.
def train_models(data, targets, groups, model=None, cropsize=2800, batch_size=512,
                 epochs=250, epochs_to_stop=15, rnn_epochs_to_stop=15):
    """
    Train a cnn3adam_filter_l2 model with an LSTM on top of the given data,
    using a 20% validation split, and return both models.
    """
    input_shape = list(np.array(data[0]).shape)
    input_shape[0] = cropsize
    n_classes = targets.shape[1]
    # Take the first GroupKFold split as a fixed 80/20 train/validation split.
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
    train_data   = [data[i] for i in train_idx]
    train_target = targets[train_idx]
    train_groups = groups[train_idx]
    val_data     = [data[i] for i in val_idx]
    val_target   = targets[val_idx]
    val_groups   = groups[val_idx]
    model = models.cnn3adam_filter_l2(input_shape, n_classes) if model is None else model(input_shape, n_classes)
    g_train = generator(train_data, train_target, batch_size, val=False, cropsize=cropsize)
    g_val   = generator(val_data, val_target, batch_size, val=True, cropsize=cropsize)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups,
                             epochs_to_stop=epochs_to_stop, plot=True,
                             name='{}, {}'.format(model.name, 'testing'))
    model.fit_generator(g_train, g_train.n_batches, epochs=epochs, callbacks=[cb],
                        max_queue_size=1, verbose=0)
    val_acc = cb.best_acc
    val_f1  = cb.best_f1
    print('CNN Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))

    # LSTM training
    rnn_modelfun = models.pure_rnn_do
    lname = 'fc1'
    seq = 6
    rnn_epochs = epochs
    stopafter_rnn = rnn_epochs_to_stop
    features = get_activations(model, train_data + val_data, lname, batch_size*2, cropsize=cropsize)
    train_data_extracted = features[0:len(train_data)]
    val_data_extracted   = features[len(train_data):]
    assert (len(train_data)==len(train_data_extracted)) and (len(val_data)==len(val_data_extracted))
    train_data_seq, train_target_seq, train_groups_seq = tools.to_sequences(
        train_data_extracted, train_target, groups=train_groups, seqlen=seq)
    val_data_seq, val_target_seq, val_groups_seq = tools.to_sequences(
        val_data_extracted, val_target, groups=val_groups, seqlen=seq)
    rnn_shape  = list((np.array(train_data_seq[0])).shape)
    neurons = int(np.sqrt(rnn_shape[-1])*4)
    rnn_model  = rnn_modelfun(rnn_shape, n_classes, layers=2, neurons=neurons, dropout=0.3)
    print('Starting RNN model {} with input from layer {} (shape {}) at {}'.format(
        rnn_model.name, lname, rnn_shape, time.ctime()))
    g_train = generator(train_data_seq, train_target_seq, batch_size, val=False)
    g_val   = generator(val_data_seq, val_target_seq, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups_seq,
                             epochs_to_stop=stopafter_rnn, plot=True,
                             name='{}, {}'.format(rnn_model.name, 'fc1'))
    rnn_model.fit_generator(g_train, g_train.n_batches, epochs=rnn_epochs, verbose=0,
                            callbacks=[cb], max_queue_size=1)
    val_acc = cb.best_acc
    val_f1  = cb.best_f1
    print('LSTM Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))

    return model, rnn_model
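Both AutoSleepScorerDev functions (this one and train_models_feat below) use only the first split of a 5-fold GroupKFold as a fixed ~80/20 train/validation partition that never splits a subject across the boundary. The idiom in isolation, on made-up toy data:

import numpy as np
from sklearn.model_selection import GroupKFold

groups = np.repeat(np.arange(10), 20)   # 10 subjects, 20 samples each
data = np.random.rand(len(groups), 3)   # toy features

# The first of 5 folds leaves ~20% of the groups out for validation.
train_idx, val_idx = next(GroupKFold(n_splits=5).split(data, groups=groups))
assert set(groups[train_idx]).isdisjoint(set(groups[val_idx]))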
Project: AutoSleepScorerDev | Author: skjerns
import numpy as np
from sklearn.model_selection import GroupKFold

# models, tools, generator and Checkpoint_balanced are helpers from the
# AutoSleepScorerDev project.
def train_models_feat(data, targets, groups, batch_size=512, epochs=250, epochs_to_stop=15):
    """
    Train an ANN and an RNN model on precomputed features of the given data,
    using a 20% validation split, and return both models.
    """
    input_shape = list(np.array(data[0]).shape)
    n_classes = targets.shape[1]
    # Take the first GroupKFold split as a fixed 80/20 train/validation split.
    train_idx, val_idx = next(GroupKFold(n_splits=5).split(groups, groups, groups))
    train_data   = [data[i] for i in train_idx]
    train_target = targets[train_idx]
    train_groups = groups[train_idx]
    val_data     = [data[i] for i in val_idx]
    val_target   = targets[val_idx]
    val_groups   = groups[val_idx]
    model = models.ann(input_shape, n_classes)
    g_train = generator(train_data, train_target, batch_size, val=False)
    g_val   = generator(val_data, val_target, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups,
                             epochs_to_stop=epochs_to_stop, plot=True,
                             name='{}, {}'.format(model.name, 'testing'))
    model.fit_generator(g_train, g_train.n_batches, epochs=epochs, callbacks=[cb],
                        max_queue_size=1, verbose=0)
    val_acc = cb.best_acc
    val_f1  = cb.best_f1
    print('ANN Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))

    # LSTM training
    # GroupKFold is deterministic, so the indices from above describe the same
    # split; only the conversion of the data lists to arrays is new here.
    train_data = np.array([data[i] for i in train_idx])
    val_data   = np.array([data[i] for i in val_idx])

    train_data_seq, train_target_seq, train_groups_seq = tools.to_sequences(
        train_data, train_target, groups=train_groups, seqlen=6)
    val_data_seq, val_target_seq, val_groups_seq = tools.to_sequences(
        val_data, val_target, groups=val_groups, seqlen=6)

    input_shape = list(np.array(train_data_seq[0]).shape)
    print(input_shape)
    rnn_model = models.pure_rnn_do(input_shape, n_classes)

    g_train = generator(train_data_seq, train_target_seq, batch_size, val=False)
    g_val   = generator(val_data_seq, val_target_seq, batch_size, val=True)
    cb = Checkpoint_balanced(g_val, verbose=1, groups=val_groups_seq,
                             epochs_to_stop=epochs_to_stop, plot=True,
                             name='{}, {}'.format(rnn_model.name, 'testing'))
    rnn_model.fit_generator(g_train, g_train.n_batches, epochs=epochs, callbacks=[cb],
                            max_queue_size=1, verbose=0)
    val_acc = cb.best_acc
    val_f1  = cb.best_f1
    print('LSTM Val acc: {:.1f}, Val F1: {:.1f}'.format(val_acc*100, val_f1*100))

    return model, rnn_model
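tools.to_sequences is another AutoSleepScorerDev helper; from its call sites above it appears to turn per-sample feature vectors into fixed-length sequences without ever mixing samples from different groups. A hypothetical sketch of such a conversion, not the project's actual implementation:

import numpy as np

def to_sequences(data, targets, groups, seqlen=6):
    """Hypothetical stand-in for tools.to_sequences: cut each group into
    sliding windows of seqlen consecutive samples; a window inherits the
    target and group label of its last element."""
    data, groups = np.asarray(data), np.asarray(groups)
    seq_data, seq_targets, seq_groups = [], [], []
    for g in np.unique(groups):
        idx = np.where(groups == g)[0]
        for start in range(len(idx) - seqlen + 1):
            window = idx[start:start + seqlen]
            seq_data.append(data[window])
            seq_targets.append(targets[window[-1]])
            seq_groups.append(g)
    return np.array(seq_data), np.array(seq_targets), np.array(seq_groups)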