Python sklearn.ensemble 模块,IsolationForest() 实例源码

我们从Python开源项目中,提取了以下38个代码示例,用于说明如何使用sklearn.ensemble.IsolationForest()

项目:HousePrices    作者:MizioAnd    | 项目源码 | 文件源码
def outlier_prediction(x_train, y_train):
        """Drop the training samples that an isolation forest labels as outliers.

        An ``IsolationForest`` (``max_samples=100``, seeded with 42) is fitted on
        ``x_train`` and then asked to label every row of ``x_train``.  Rows whose
        label equals 1 are kept; everything else is discarded.

        :param x_train: feature matrix; it is indexed with a boolean row mask, so
            presumably a NumPy array -- TODO confirm against callers.
        :param y_train: targets aligned row-by-row with ``x_train``.
        :return: ``(x_train_modified, y_train_modified)`` restricted to kept rows.
        """
        rng = np.random.RandomState(42)
        clf_all_features = IsolationForest(max_samples=100, random_state=rng)
        clf_all_features.fit(x_train)

        # Label every sample using all features of the data set.
        y_pred_train = clf_all_features.predict(x_train)

        # Vectorised comparison instead of np.array(map(...)): on Python 3 map()
        # is lazy, so wrapping it in np.array yields a 0-d object array rather
        # than a boolean mask and the indexing below would fail.
        inlier_mask = np.asarray(y_pred_train) == 1
        x_train_modified = x_train[inlier_mask, ]
        y_train_modified = y_train[inlier_mask, ]

        return x_train_modified, y_train_modified
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(window,buf,maxContainSize):
    if len(buf) >= maxContainSize:#??buf??
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :",len(window)
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=60)
        ilf.fit(window)
        print "isolation update finished"

    else:                       #???????buf????
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=60)
        ilf.fit(window)
        print "isolation update finished"
    return window,ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def init(idlist,d,dblack,outcome,winsize=200,sleeptime = 5):
    #????
    window =  []
    while True:
        print "fetching at %s" %ctime()
        data = getdata()
        loadvalue(data, d,dblack)
        outvalue = extract(d,idlist)
        window.append(outvalue)
        if len(window) > winsize:
            break
        sleep(sleeptime)
    #?????
    ilf = IsolationForest(n_estimators=60)
    ilf.fit(window)
    print ilf.predict(window)
    for i in ilf.predict(window):
        outcome.append(i)
    #??
    return ilf,window
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(window,buf,maxContainSize):
    if len(buf) >= maxContainSize:#??buf??
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :",len(window)
        for i in buf:
            window = window.append(i)
        ilf = IsolationForest(n_estimators=100,verbose=2,)
        ilf.fit(window)
        print "isolation update finished"

    else:                       #???????buf????
        print "higher than threads"
        for i in buf:
            window = window.append(i)
        ilf = IsolationForest(n_estimators=100,verbose=2,)
        ilf.fit(window)
        print "isolation update finished"
    return window,ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(window,buf,maxContainSize):
    if len(buf) >= maxContainSize:#??buf??
        print window################################################
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :",len(window)
        for i in buf:
            window.append(i)
            #print i
        ilf = IsolationForest(n_estimators=100)
        ilf.fit(window)
        print "isolation update finished"

    else:                       #???????buf????
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100)
        ilf.fit(window)
        print "isolation update finished"
    return window,ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(window,buf,maxContainSize):
    if len(buf) >= maxContainSize:#??buf??
        print "buffer full "
        window = clusteringReminMost(window)
        print "window size after clustering without adding buffer :",len(window)
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100,verbose=2,)
        ilf.fit(window)
        print "isolation update finished"

    else:                       #???????buf????
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100,verbose=2,)
        ilf.fit(window)
        print "isolation update finished"
    return window,ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(window,buf,maxContainSize):
    if len(buf) >= maxContainSize:#??buf??
        print "buffer full "
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100,contamination=0.01)
        ilf.fit(window)
        print "isolation update finished"

    else:                       #???????buf????
        print "higher than threads"
        for i in buf:
            window.append(i)
        ilf = IsolationForest(n_estimators=100,contamination=0.01)
        ilf.fit(window)
        print "isolation update finished"
    return window,ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def init(idlist,d,dblack,outcome,winsize=200,sleeptime = 5):
    #????
    window =  []
    while True:
        print "fetching at %s" %ctime()
        data = getdata()
        loadvalue(data, d,dblack)
        outvalue = extract(d,idlist)
        window.append(outvalue)
        if len(window) > winsize:
            break
        sleep(sleeptime)
    #?????
    ilf = IsolationForest(n_estimators=100,contamination=0.01)
    ilf.fit(window)
    print ilf.predict(window)
    for i in ilf.predict(window):
        outcome.append(i)
    #??
    return ilf,window
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(l_sys, l_namenode, l_FS, l_RPC,cont):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    query = 'select * from ganglia where w_fs >0 and w_namenode>0 and w_rpc >0 limit 1024;'  # ???? ???
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    result = client.query(query, chunked=False)
    data = result['ganglia']
    d_sys = data[l_sys]
    d_namenode = data[l_namenode]
    d_FS = data[l_FS]
    d_RPC = data[l_RPC]

    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)

    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)

    print "update finished"
    return ilf_sys,ilf_namenode,ilf_FS,ilf_RPC
项目:Bacchus    作者:surfstudio    | 项目源码 | 文件源码
def transform(self, X, **transform_params):
        """Return ``X`` without the rows an isolation forest labels as outliers.

        ``X`` is returned untouched when it has fewer than ``1/contamination``
        rows.  Otherwise a new ``IsolationForest`` is stored on
        ``self.isolation_forest``, fitted on the columns chosen by
        ``_columns_to_apply`` (falling back to ``_numeric_columns`` when that
        returns ``None``), and only rows predicted as 1 are kept in both the
        analysed columns and the remaining columns.

        :param X: input frame (must expose ``shape``).
        :param transform_params: accepted for interface compatibility; unused.
        :return: ``X`` itself, or the column-wise concatenation of the filtered
            analysed columns and the filtered remaining columns.
        """
        if X.shape[0] < 1/self.contamination:
            return X
        self.isolation_forest = IsolationForest(contamination=self.contamination,
                                                n_estimators=self.n_estimators,
                                                n_jobs=self.n_jobs)
        to_analyze = self._columns_to_apply(X)
        if to_analyze is None:
            to_analyze = self._numeric_columns(X)
        rest = self._rest_columns(X, to_analyze)
        self.isolation_forest.fit(to_analyze)
        labels = self.isolation_forest.predict(to_analyze)
        # Filter with a boolean mask instead of a temporary '_outlier' column:
        # the column used to be written into the frames returned by the helpers
        # and was only deleted from the filtered copies, so it leaked into the
        # source frames (and would overwrite a real column of that name).
        inlier_mask = labels == 1
        to_analyze = to_analyze[inlier_mask]
        rest = rest[inlier_mask]
        if self.verbose:
            print('%s Now has %s' % (self.class_name, to_analyze.shape[0]))
        return pd.concat((to_analyze, rest), axis=1)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_iforest_error():
    """Test that it gives proper exception on deficient input."""
    X = iris.data

    # Numeric max_samples values that must be rejected when fitting.
    for bad_max_samples in (-1, 0.0, 2.0):
        assert_raises(ValueError,
                      IsolationForest(max_samples=bad_max_samples).fit, X)

    # Explicitly asking for more samples than the data set provides only
    # triggers a warning; the default 'auto' setting must stay silent.
    oversized = IsolationForest(max_samples=1000)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         oversized.fit, X)
    automatic = IsolationForest(max_samples='auto')
    assert_no_warnings(automatic.fit, X)

    # An unknown string is an error as well.
    unknown = IsolationForest(max_samples='foobar')
    assert_raises(ValueError, unknown.fit, X)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data: 120 regular points, the first 100 for training.
    # (A dead ``np.r_[X + 2, X - 2]`` assignment that was immediately
    # overwritten has been dropped.)
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # Continuous anomaly score for ROC AUC.  decision_function is larger for
    # normal points, so negate it to line up with y_test (1 == outlier);
    # predict() returns hard +1/-1 labels in released scikit-learn versions.
    y_pred = -clf.decision_function(X_test)

    # check that the ranking of outliers is nearly perfect
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
项目:monasca-analytics    作者:openstack    | 项目源码 | 文件源码
def __init__(self, _id, _config):
        """Initialise the parent class and remember the configured sample count.

        ``_config['nb_samples']`` is converted to ``int`` and stored in
        ``self._nb_samples``.
        """
        super(IsolationForest, self).__init__(_id, _config)
        configured_samples = _config['nb_samples']
        self._nb_samples = int(configured_samples)
项目:monasca-analytics    作者:openstack    | 项目源码 | 文件源码
def get_default_config():
        """Return the default configuration: module name and sample count."""
        defaults = dict(module=IsolationForest.__name__,
                        nb_samples=N_SAMPLES)
        return defaults
项目:monasca-analytics    作者:openstack    | 项目源码 | 文件源码
def _get_best_detector(self, train):
        """Fit a default ``ensemble.IsolationForest`` on ``train`` and return it."""
        forest = ensemble.IsolationForest()
        forest.fit(train)
        return forest
项目:monasca-analytics    作者:openstack    | 项目源码 | 文件源码
def setUp(self):
        """Create the ``IsolationForest`` component under test with a fake config."""
        super(TestIsolationForest, self).setUp()
        fake_config = {"module": "fake", "nb_samples": 1000}
        self.if_sml = isolation_forest.IsolationForest("fakeid", fake_config)
项目:monasca-analytics    作者:openstack    | 项目源码 | 文件源码
def test_learn_structure(self):
        """``learn_structure`` must hand back an ``ensemble.IsolationForest``."""
        samples = self.get_testing_data()
        learned = self.if_sml.learn_structure(samples)
        self.assertIsInstance(learned, ensemble.IsolationForest)
项目:yttresearch-machine-learning-algorithms-analysis    作者:gdemos01    | 项目源码 | 文件源码
def exportPresentationData(classifier,action):
        """Ask for a data directory and run the classifier selected by number.

        ``classifier`` is converted with ``int`` and mapped to an estimator:
        1 gradient boosting, 2 logistic regression, 3 k-nearest neighbours
        (k=5), 4 decision tree, 5 linear SVC, 6 random forest, 7 extra trees,
        8 isolation forest, 9 AdaBoost (100 estimators), 10 bagging of decision
        trees, 11 soft-voting ensemble.  Choices 5 and 8 are run through
        ``classify_type2``; all others through ``classify``.  Any other number
        does nothing.
        """
        data_dir = input('Give Data Directory: ')
        choice = int(classifier)

        def build_voting():
            clf1 = GradientBoostingClassifier()
            clf2 = AdaBoostClassifier()
            return VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')

        # choice -> (factory, goes through classify_type2?).  Factories are
        # lambdas so only the selected estimator is ever constructed.
        registry = {
            1: (lambda: GradientBoostingClassifier(), False),
            2: (lambda: LogisticRegression(), False),
            3: (lambda: KNeighborsClassifier(n_neighbors=5), False),
            4: (lambda: DecisionTreeClassifier(), False),
            5: (lambda: svm.LinearSVC(), True),
            6: (lambda: RandomForestClassifier(), False),
            7: (lambda: ExtraTreesClassifier(), False),
            8: (lambda: IsolationForest(), True),
            9: (lambda: AdaBoostClassifier(n_estimators=100), False),
            10: (lambda: BaggingClassifier(DecisionTreeClassifier()), False),
            11: (build_voting, False),
        }
        if choice not in registry:
            return
        make_clf, use_type2 = registry[choice]
        clf = make_clf()
        if use_type2:
            classify_type2(data_dir,clf,action)
        else:
            classify(data_dir,clf,action)
项目:yttresearch-machine-learning-algorithms-analysis    作者:gdemos01    | 项目源码 | 文件源码
def exportPresentationData(classifier,action,dir):
        """Run the classifier selected by number on the data in ``dir``.

        ``classifier`` is converted with ``int``; values 1-11 select, in order:
        gradient boosting, logistic regression, k-nearest neighbours (k=5),
        decision tree, linear SVC, random forest, extra trees, isolation
        forest, AdaBoost (100 estimators), bagging of decision trees and a
        soft-voting ensemble.  Linear SVC (5) and isolation forest (8) are
        handed to ``classify_type2``, everything else to ``classify``.
        Unknown numbers are ignored.
        """
        selected = int(classifier)

        def soft_voting():
            clf1 = GradientBoostingClassifier()
            clf2 = AdaBoostClassifier()
            return VotingClassifier(estimators=[('abdt', clf1), ('gbdt', clf2)], voting='soft')

        # Lazily evaluated factories: nothing is built until one is chosen.
        factories = {
            1: lambda: GradientBoostingClassifier(),
            2: lambda: LogisticRegression(),
            3: lambda: KNeighborsClassifier(n_neighbors=5),
            4: lambda: DecisionTreeClassifier(),
            5: lambda: svm.LinearSVC(),
            6: lambda: RandomForestClassifier(),
            7: lambda: ExtraTreesClassifier(),
            8: lambda: IsolationForest(),
            9: lambda: AdaBoostClassifier(n_estimators=100),
            10: lambda: BaggingClassifier(DecisionTreeClassifier()),
            11: soft_voting,
        }
        factory = factories.get(selected)
        if factory is None:
            return
        clf = factory()
        if selected in (5, 8):
            classify_type2(dir,clf,action)
        else:
            classify(dir,clf,action)
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def init(idlist,d,dblack,winsize=50):

    data = getdata()
    loadvalue(data, d,dblack)
    outvalue = extract(d,idlist)
    print len(outvalue)
    reshapevalue = np.array(outvalue).reshape(1,-1) 
    window =  DataFrame(reshapevalue)
    buf = []#################
    while True:
        print "fetching at %s" %ctime()
        data = getdata()
        loadvalue(data, d,dblack)
        outvalue = extract(d,idlist)
        reshapevalue = np.array(outvalue).reshape(1,-1) 
        window = window.append(DataFrame(reshapevalue))#??dataframe???1row * xcolums
        buf.append(DataFrame(reshapevalue))
        print len(window)
        if len(window) > winsize:
            break
        sleep(5)
    ilf = IsolationForest(n_estimators=100,verbose=2,)
    ilf.fit(window)
    print ilf.predict(window)
    print "__________________"
    for i in buf:
        print ilf.predict(i)
    return ilf,window
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(buf, cont):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    ilf.fit(buf)  # ??buf??????
    print "isolation update finished"
    return ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def init(l_sys, l_namenode, l_FS, l_RPC, d, dwhite, winsize=200, sleeptime=15, cont=0.01):
    # ????
    win_sys = []
    win_namenode = []
    win_FS = []
    win_RPC = []
    while True:
        print "fetching at %s" % ctime()
        data = getdata()
        loadvalue(data, d, dwhite)
        o_sys, o_namenode, o_FS, o_RPC = extract(d, l_sys, l_namenode, l_FS, l_RPC)
        # ??????????
        win_sys.append(o_sys)
        win_namenode.append(o_namenode)
        win_FS.append(o_FS)
        win_RPC.append(o_RPC)

        if len(win_sys) > winsize:  # ????????????
            break
        sleep(sleeptime)
    # ?????
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)

    # ??fit
    ilf_sys.fit(win_sys)
    ilf_namenode.fit(win_namenode)
    ilf_FS.fit(win_FS)
    ilf_RPC.fit(win_RPC)

    print ilf_sys.predict(win_sys)
    print ilf_namenode.predict(win_namenode)
    print ilf_FS.predict(win_FS)
    print ilf_RPC.predict(win_RPC)

    # ??????????????
    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(buf,cont):

    ilf = IsolationForest(n_estimators=100,contamination=cont)
    ilf.fit(buf)#??buf??????
    print "isolation update finished"
    return ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def init(l_sys,l_namenode,l_FS,l_RPC,l_queue,d,dwhite,winsize=200,sleeptime = 15,cont=0.01):
    #????
    win_sys =  []
    win_namenode = []
    win_FS = []
    win_RPC =[]
    win_queue = []

    while True:
        print "fetching at %s" %ctime()
        data = getdata()
        loadvalue(data, d,dwhite)
        o_sys,o_namenode,o_FS,o_RPC,o_queue  = extract(d,l_sys,l_namenode,l_FS,l_RPC,l_queue)
        #??????????
        win_sys.append(o_sys)
        win_namenode.append(o_namenode)
        win_FS.append(o_FS)
        win_RPC.append(o_RPC)
        win_queue.append(o_queue)

        if len(win_sys) > winsize:#????????????
            break
        sleep(sleeptime)
    #?????
    ilf_sys = IsolationForest(n_estimators=100,contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100,contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100,contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100,contamination=cont)
    ilf_queue = IsolationForest(n_estimators=100,contamination=cont)
    #??fit
    ilf_sys.fit(win_sys)
    ilf_namenode.fit(win_namenode)
    ilf_FS.fit(win_FS)
    ilf_RPC.fit(win_RPC)
    ilf_queue.fit(win_queue)
    #??????????????
    return ilf_sys,ilf_namenode,ilf_FS,ilf_queue,ilf_RPC
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(buf,cont):

    ilf = IsolationForest(n_estimators=100,contamination=cont)
    ilf.fit(buf)#??buf??????
    print "isolation update finished"
    return ilf
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(l_sys, l_namenode, l_FS, l_RPC,cont,limit):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    #???
    data_sys = sampleWithDecay(client,limit,'select * from ganglia where w_system >0 ORDER BY time DESC limit 1500')#????limit????????
    d_sys = data_sys[l_sys]

    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC limit 1500')
    d_FS = data_fs[l_FS]

    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC limit 1500')
    d_namenode = data_namenode[l_namenode]

    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC limit 1500')
    d_RPC = data_rpc[l_RPC]

    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    #?????????
    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)

    print "update finished"
    return ilf_sys,ilf_namenode,ilf_FS,ilf_RPC
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(l_sys, l_namenode, l_FS, l_RPC,cont,limit):
    ilf = IsolationForest(n_estimators=100, contamination=cont)
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')
    #???
    data_sys = sampleWithDecay(client,limit,'select * from ganglia where w_system >0 ORDER BY time DESC')
    d_sys = data_sys[l_sys]

    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC')
    d_FS = data_fs[l_FS]

    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC')
    d_namenode = data_namenode[l_namenode]

    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC')
    d_RPC = data_rpc[l_RPC]

    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=100, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    #?????????
    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)

    print "update finished"
    return ilf_sys,ilf_namenode,ilf_FS,ilf_RPC
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def init(l_sys, l_namenode, l_FS, l_RPC, sleeptime=15, cont=0.01,limit = 300):
    # ?????
    ilf_sys = IsolationForest(n_estimators=100, contamination=cont)
    ilf_namenode = IsolationForest(n_estimators=100, contamination=cont)
    ilf_FS = IsolationForest(n_estimators=50, contamination=cont)
    ilf_RPC = IsolationForest(n_estimators=100, contamination=cont)
    #??????????
    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')

    data_sys = sampleWithDecay(client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC')
    d_sys = data_sys[l_sys]

    data_fs = sampleWithDecay(client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC')
    d_FS = data_fs[l_FS]

    data_namenode = sampleWithDecay(client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC')
    d_namenode = data_namenode[l_namenode]

    data_rpc = sampleWithDecay(client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC')
    d_RPC = data_rpc[l_RPC]


    print len(d_sys)
    print len(d_FS)
    print len(d_namenode)
    print len(d_RPC)
    # ??fit
    ilf_sys.fit(d_sys)
    ilf_namenode.fit(d_namenode)
    ilf_FS.fit(d_FS)
    ilf_RPC.fit(d_RPC)

    print ilf_FS.predict(d_FS)

    return ilf_sys, ilf_namenode, ilf_FS, ilf_RPC
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def updateWindow(buf,cont):

    ilf = IsolationForest(n_estimators=100,contamination=cont)
    ilf.fit(buf)#??buf??????
    print "isolation update finished"
    return ilf
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_iforest():
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    search_space = {"n_estimators": [3],
                    "max_samples": [0.5, 1.0, 3],
                    "bootstrap": [True, False]}
    grid = ParameterGrid(search_space)

    # Smoke test only: every combination must fit and predict without error.
    with ignore_warnings():
        for params in grid:
            model = IsolationForest(random_state=rng, **params)
            model.fit(X_train).predict(X_test)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            # Sparse and dense input must give identical predictions.  (The
            # same assertion used to be written twice; once is enough.)
            assert_array_equal(sparse_results, dense_results)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_recalculate_max_depth():
    """Check that max_depth is recalculated when max_samples is reset to n_samples"""
    X = iris.data
    n_samples = X.shape[0]
    forest = IsolationForest().fit(X)
    # Every tree must be capped at ceil(log2(n_samples)).
    for tree in forest.estimators_:
        assert_equal(tree.max_depth, int(np.ceil(np.log2(n_samples))))
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_max_samples_attribute():
    """Check the fitted ``max_samples_`` for default, oversized and fractional settings."""
    X = iris.data

    # Default setting: max_samples_ equals the number of samples here.
    default_forest = IsolationForest().fit(X)
    assert_equal(default_forest.max_samples_, X.shape[0])

    # Requesting more samples than available warns and falls back to n_samples.
    oversized_forest = IsolationForest(max_samples=500)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         oversized_forest.fit, X)
    assert_equal(oversized_forest.max_samples_, X.shape[0])

    # A float is interpreted as a fraction of the data set.
    fractional_forest = IsolationForest(max_samples=0.4).fit(X)
    assert_equal(fractional_forest.max_samples_, 0.4*X.shape[0])
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_iforest_works():
    """Check on a toy data set that the two outliers are separated from the rest."""
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Fit the isolation forest and predict on the training points.
    forest = IsolationForest(random_state=rng)
    forest.fit(X)
    pred = forest.predict(X)

    # The smallest prediction among the two outliers must exceed the largest
    # prediction among the remaining samples.
    regular_pred, outlier_pred = pred[:-2], pred[-2:]
    assert_greater(np.min(outlier_pred), np.max(regular_pred))
项目:dmon-adp    作者:igabriel85    | 项目源码 | 文件源码
def isolationForest(self, settings, mname, data):
        '''
        :param settings: -> settings dictionary
        :param mname: -> name of serialized cluster
        :return: -> isolation forest instance
        :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                        max_features:1.0, n_jobs:1, random_state:None, verbose:0}
        '''
        # rng = np.random.RandomState(42)
        if settings['random_state'] == 'None':
            settings['random_state'] = None

        if isinstance(settings['bootstrap'], str):
            settings['bootstrap'] = str2Bool(settings['bootstrap'])

        if isinstance(settings['verbose'], str):
            settings['verbose'] = str2Bool(settings['verbose'])

        if settings['max_samples'] != 'auto':
            settings['max_samples'] = int(settings['max_samples'])
        # print type(settings['max_samples'])
        for k, v in settings.iteritems():
            logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
            print "IsolationForest %s set to %s" % (k, v)
        try:
            clf = IsolationForest(n_estimators=int(settings['n_estimators']), max_samples=settings['max_samples'], contamination=float(settings['contamination']), bootstrap=settings['bootstrap'],
                        max_features=float(settings['max_features']), n_jobs=int(settings['n_jobs']), random_state=settings['random_state'], verbose=settings['verbose'])
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot instanciate isolation forest with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print "Error while  instanciating isolation forest with %s and %s" % (type(inst), inst.args)
            sys.exit(1)
        # clf = IsolationForest(max_samples=100, random_state=rng)
        # print "*&*&*&& %s" % type(data)
        try:
            clf.fit(data)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(1)
        predict = clf.predict(data)
        print "Anomaly Array:"
        print predict
        self.__serializemodel(clf, 'isoforest', mname)
        return clf
项目:dmon-adp    作者:igabriel85    | 项目源码 | 文件源码
def detect(self, method, model, data):
        '''
        :param method: -> method name
        :param model: -> trained clusterer
        :param data: -> dataframe with data
        :return: -> dictionary that contains the list of anomalous timestamps
        '''
        smodel = self.__loadClusterModel(method, model)
        anomalieslist = []
        if not smodel:
            dpredict = 0
        else:
            if data.shape[0]:
                if isinstance(smodel, IsolationForest):
                    print "Detected IsolationForest model"
                    print "Contamination -> %s" % smodel.contamination
                    print "Max_Features -> %s" % smodel.max_features
                    print "Max_Samples -> %s" % smodel.max_samples_
                    print "Threashold -> %s " % smodel.threshold_
                    try:
                        dpredict = smodel.predict(data)
                        print "IsolationForest Prediction Array -> %s" %str(dpredict)
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                        dpredict = 0

                elif isinstance(smodel, DBSCAN):
                    print "Detected DBSCAN model"
                    print "Leaf_zise -> %s" % smodel.leaf_size
                    print "Algorithm -> %s" % smodel.algorithm
                    print "EPS -> %s" % smodel.eps
                    print "Min_Samples -> %s" % smodel.min_samples
                    print "N_jobs -> %s" % smodel.n_jobs
                    try:
                        dpredict = smodel.fit_predict(data)
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while fitting sDBSCAN model to event with %s and %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                                     inst.args)
                        dpredict = 0
            else:
                dpredict = 0
                logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape[0]),
                             str(data.shape[1]))
                print "Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]),
                             str(data.shape[1]))
            print "dpredict type is %s" % (type(dpredict))
            if type(dpredict) is not int:
                anomalyarray = np.argwhere(dpredict == -1)
                for an in anomalyarray:
                    anomalies = {}
                    anomalies['utc'] = int(data.iloc[an[0]]['key'])
                    anomalies['hutc'] = ut2hum(int(data.iloc[an[0]]['key']))
                    anomalieslist.append(anomalies)
        anomaliesDict = {}
        anomaliesDict['anomalies'] = anomalieslist
        logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
        return anomaliesDict
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def online_detect():
    """Run the online anomaly-detection loop on freshly fetched samples.

    An IsolationForest is first fitted on the rows of ``ganglia.csv`` from
    positional index 1000 onwards.  Each loop iteration then fetches one
    sample, scores it with the current model, records the prediction and
    lets ``warn`` / ``detectUpdate`` decide whether a warning is analysed
    or the model is refitted through ``updateWindow``.

    The loop calls ``sleep(15)`` between iterations and exits when the
    iteration counter (which starts at 1) reaches a multiple of 5000.
    Nothing is returned; results are only printed.
    """
    df = pd.read_csv('ganglia.csv')
    maxContainSize = 500  # forwarded to updateWindow / detectUpdate
    window = df[1000:]
    ilf = IsolationForest(n_estimators=100, verbose=2)
    ilf.fit(window)
    print(ilf.predict(window))
    anomaly_count = 0  # samples predicted -1 since the last model refit

    outcome = []  # raw predictions, one int per fetched sample
    labels = []   # same predictions; the last one is reset to 1 after a warning
    k = 3         # passed through to warn / analyseWarn

    d = {}
    buf = []      # one single-row DataFrame per sample since the last refit
    idlist, namelist = loadname()
    savename(namelist, idlist)
    print("initial finished")
    counter = 1
    while True:
        print("fetching at %s" % ctime())
        data = getdata()
        loadvalue(data, d)
        outvalue = extract(d, idlist)
        reshapevalue = np.array(outvalue).reshape(1, -1)
        predictValue = ilf.predict(reshapevalue)
        print("predict: %s" % (predictValue,))

        # predict() was given exactly one row, so take its single element
        # explicitly instead of relying on implicit array -> int conversion.
        a = int(predictValue[0])
        outcome.append(a)
        labels.append(a)
        buf.append(DataFrame(reshapevalue))

        if a == -1:
            anomaly_count += 1

        if warn(buf, labels, k):
            labels[-1] = 1
            analyseWarn(buf, outcome, k, namelist)
            # NOTE(review): the (window, model) pair returned here is
            # discarded, unlike in the detectUpdate branch below -- confirm
            # whether that is intentional.
            updateWindow(window, buf, maxContainSize)

        # The source carried "0.087" as a comment next to the 0.87 argument.
        if detectUpdate(buf, 0.87, maxContainSize, anomaly_count):
            # Drop the old model before a new one is fitted.
            del ilf
            window, ilf = updateWindow(window, buf, maxContainSize)
            anomaly_count = 0
            buf = []

        counter += 1
        if counter % 5000 == 0:
            break
        sleep(15)
项目:onlineDetectForHadoop    作者:DawnsonLi    | 项目源码 | 文件源码
def init(l_sys, l_namenode, l_FS, l_RPC, d, dwhite, winsize=200, sleeptime=15, cont=0.01, limit=300):
    """Gather an initial window of samples and fit four isolation forests.

    Samples are fetched in a loop (calling ``sleep(sleeptime)`` between
    fetches) until more than ``winsize`` have been collected for each of the
    four groups: sys, namenode, FS and RPC.  For every group the rows
    returned by ``sampleWithDecay`` (restricted to that group's columns) are
    concatenated with the freshly collected rows, and one IsolationForest
    with ``contamination=cont`` is fitted on the result.  The predictions of
    each model on its freshly collected rows are printed.

    Returns the fitted models as a tuple in the order
    (sys, namenode, FS, RPC).
    """
    # Everything below that is grouped in a tuple/list follows this order.
    columns = (l_sys, l_namenode, l_FS, l_RPC)
    live = ([], [], [], [])

    while True:
        print("fetching at %s" % ctime())
        loadvalue(getdata(), d, dwhite)
        o_sys, o_namenode, o_FS, o_RPC = extract(d, l_sys, l_namenode, l_FS, l_RPC)
        for rows, sample in zip(live, (o_sys, o_namenode, o_FS, o_RPC)):
            rows.append(sample)
        # Stop once the sys window holds winsize + 1 samples.
        if len(live[0]) > winsize:
            break
        sleep(sleeptime)

    models = [IsolationForest(n_estimators=100, contamination=cont) for _ in columns]

    client = DataFrameClient(host='127.0.0.1', port=8086, username='root', password='root', database='testdb')

    # Queries are issued in the order sys, fs, namenode, rpc.
    hist_sys = sampleWithDecay(
        client, limit, 'select * from ganglia where w_system >0 ORDER BY time DESC limit 1500')[l_sys]
    hist_FS = sampleWithDecay(
        client, limit, 'select * from ganglia where w_fs >0 ORDER BY time DESC limit 1500')[l_FS]
    hist_namenode = sampleWithDecay(
        client, limit, 'select * from ganglia where w_namenode >0 ORDER BY time DESC limit 1500')[l_namenode]
    hist_RPC = sampleWithDecay(
        client, limit, 'select * from ganglia where w_rpc >0 ORDER BY time DESC limit 1500')[l_RPC]
    history = (hist_sys, hist_namenode, hist_FS, hist_RPC)

    # Turn the collected rows into frames, then stack them under the history.
    fresh = [pd.DataFrame(rows, columns=cols) for rows, cols in zip(live, columns)]
    training = [pd.concat([old, new]) for old, new in zip(history, fresh)]

    for model, frame in zip(models, training):
        model.fit(frame)

    for model, rows in zip(models, live):
        print(model.predict(rows))

    return tuple(models)