首页 > 美文鉴赏

阅读源码系列：sklearn中train_test_split使用及源码学习

更新时间:2023-07-18 08:22:21 阅读：评论：0

阅读源码系列：sklearn中train_test_split使用及源码学习

引言

最近用到了sklearn中的train_test_split，感觉很好用，想来实现也不是特别复杂，于是想着看一下源码，学习一下大佬的写法。

使用方法

>>>import numpy as np

>>> from sklearn.model_selection import train_test_split

>>> X, y = np.arange(10).reshape((5,2)),range(5)

>>> X

array([[0,1],

[2,3],

[4,5],

[6,7],

[8,9]])

>>>list(y)

[0,1,2,3,4]

>>> X_train, X_test, y_train, y_test = train_test_split(

... X, y, test_size=0.33, random_state=42)

...

>>> X_train

array([[4,5],

[0,1],

[6,7]])

>>> y_train

[2,0,3]

>>> X_test

array([[2,3],

[8,9]])

>>> y_test

[1,4]

>>> train_test_split(y, shuffle=False)

[[0,1,2],[3,4]]

学习到东西

作者在进行每一个操作前，都确保所得到的数据是想要的，尤其是类型判断，所以说写一个合格的接口，绝非易事

合理的异常捕捉和错误提示

一个函数尽量只做一件事，以及函数的命名都很讲究，清楚明了

迭代器的相关用法，比如chain相关模块需要进一步了解

源码剥离

从sklearn中剥离出train_test_split的相关代码

主要如下（可以简单运行）：

from itertools import chain, compress
from math import ceil, floor
import numbers

import numpy as np
import scipy.sparse as sp

def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None (or the ``np.random`` module itself), return the
        RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    numpy.random.RandomState
        The random state selected according to the rules above.

    Raises
    ------
    ValueError
        If seed is none of the accepted kinds.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # Wrap seed in a 1-tuple so that a tuple-valued seed is formatted as a
    # single value instead of being unpacked as multiple format arguments.
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % (seed,))

class BaseShuffleSplit:
    """Base class for ShuffleSplit and StratifiedShuffleSplit.

    Stores the split configuration and implements ``split`` on top of
    ``_iter_indices``, which has no implementation here and must be
    overridden by subclasses.
    """

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        self.n_splits = n_splits
        # Subclasses read these in _iter_indices, so they must be stored.
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        self._default_test_size = 0.1

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset
            into train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        for train, test in self._iter_indices(X, y, groups):
            yield train, test

    def _iter_indices(self, X, y=None, groups=None):
        """Generate (train, test) indices.

        No implementation in the base class; subclasses override this.
        """

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits

    def __repr__(self):
        return _build_repr(self)


class ShuffleSplit(BaseShuffleSplit):
    """Random permutation cross-validator

    Yields indices to split data into training and test sets.

    Note: contrary to other cross-validation strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.1.

    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    # >>> import numpy as np
    # >>> from sklearn.model_selection import ShuffleSplit
    # >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])
    # >>> y = np.array([1, 2, 1, 2, 1, 2])
    # >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
    # >>> rs.get_n_splits(X)
    # 5
    # >>> print(rs)
    # ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None)
    # >>> for train_index, test_index in rs.split(X):
    # ...     print("TRAIN:", train_index, "TEST:", test_index)
    # TRAIN: [1 3 0 4] TEST: [5 2]
    # TRAIN: [4 0 2 5] TEST: [1 3]
    # TRAIN: [1 2 4 0] TEST: [3 5]
    # TRAIN: [3 4 1 0] TEST: [5 2]
    # TRAIN: [3 5 1 0] TEST: [2 4]
    # >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,
    # ...                   random_state=0)
    # >>> for train_index, test_index in rs.split(X):
    # ...     print("TRAIN:", train_index, "TEST:", test_index)
    # TRAIN: [1 3 0] TEST: [5 2]
    # TRAIN: [4 0 2] TEST: [1 3]
    # TRAIN: [1 2 4] TEST: [3 5]
    # TRAIN: [3 4 1] TEST: [5 2]
    # TRAIN: [3 5 1] TEST: [2 4]
    """

    def __init__(self, n_splits=10, *, test_size=None, train_size=None,
                 random_state=None):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
        self._default_test_size = 0.1

    def _iter_indices(self, X, y=None, groups=None):
        """Yield ``n_splits`` random (train, test) index pairs for X."""
        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(
            n_samples, self.test_size, self.train_size,
            default_test_size=self._default_test_size)
        rng = check_random_state(self.random_state)
        for _ in range(self.n_splits):
            # random partition: the first n_test shuffled indices form the
            # test set, the following n_train indices form the train set
            permutation = rng.permutation(n_samples)
            ind_test = permutation[:n_test]
            ind_train = permutation[n_test:(n_test + n_train)]
            yield ind_train, ind_test

def _num_samples(x):
    """Return number of samples in array-like x.

    Parameters
    ----------
    x : array-like
        Object exposing ``shape``, ``__len__`` or ``__array__``.

    Returns
    -------
    int
        ``x.shape[0]`` when it is an integer, otherwise ``len(x)``.

    Raises
    ------
    TypeError
        If x has a callable ``fit`` attribute, is a 0-d (singleton) array,
        or offers no way to determine a length.
    """
    message = 'Expected sequence or array-like, got %s' % type(x)
    if hasattr(x, 'fit') and callable(x.fit):
        # Don't get num_samples from an ensembles length!
        raise TypeError(message)

    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
        if hasattr(x, '__array__'):
            x = np.asarray(x)
        else:
            raise TypeError(message)

    if hasattr(x, 'shape') and x.shape is not None:
        if len(x.shape) == 0:
            raise TypeError("Singleton array %r cannot be considered"
                            " a valid collection." % x)
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        if isinstance(x.shape[0], numbers.Integral):
            return x.shape[0]

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error

def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length. ``None``
        entries are skipped.

    Raises
    ------
    ValueError
        If the non-None inputs do not all report the same number of samples.
    """
    lengths = [_num_samples(X) for X in arrays if X is not None]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError("Found input variables with inconsistent numbers of"
                         " samples: %r" % [int(length) for length in lengths])

def _make_indexable(iterable):
    """Ensure iterable supports indexing or convert to an indexable variant.

    Convert sparse matrices to csr and other non-indexable iterable to arrays.
    Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.

    Returns
    -------
    object
        A CSR matrix for sparse input; the input itself when it is None or
        has ``__getitem__`` / ``iloc``; otherwise ``np.array(iterable)``.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()
    elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
        return iterable
    elif iterable is None:
        return iterable
    return np.array(iterable)

def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that
    everything can be indexed by converting sparse matrices to csr and
    converting non-iterable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    list
        One entry per input, in order, each passed through
        ``_make_indexable``.
    """
    result = [_make_indexable(X) for X in iterables]
    check_consistent_length(*result)
    return result

def_validate_shuffle_split(n_samples, test_size, train_size,

default_test_size=None):

"""

Validation helper to check if the test/test sizes are meaningful wrt to the

size of the data (n_samples)

"""

if test_size is None and train_size is None:

test_size = default_test_size

# 获得数据类型

test_size_type = np.asarray(test_size).dtype.kind

train_size_type = np.asarray(train_size).dtype.kind

if(test_size_type =='i'and(test_size >= n_samples or test_size <=0) or test_size_type =='f'and(test_size <=0or test_size >=1)):

rai ValueError('test_size={0} should be either positive and smaller'

' than the number of samples {1} or a float in the '

'(0, 1) range'.format(test_size, n_samples))

if(train_size_type =='i'and(train_size >= n_samples or train_size <=0) or train_size_type =='f'and(train_size <=0or train_size >=1)):

rai ValueError('train_size={0} should be either positive and smaller'

' than the number of samples {1} or a float in the '

'(0, 1) range'.format(train_size, n_samples))

快速瘦身方法if train_size is not None and train_size_type not in('i','f'):

本文发布于:2023-07-18 08:22:21，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/89/1086113.html

上一篇：一种基于TCM-SVDD的样本类别标注方法-论文

下一篇：几何布朗运动

标签：源码相关捕捉命名需要学习

留言与评论（共有 0 条评论）