python学习笔记之sklearn数据预处理归一化可参考帮助理解!
标准归一化
归一化到均值为0,方差为1
sklearn.preprocessing.scale函数:Standardize a dataset along any axis
先贴出主要的源码,乍一看,很乱,其实细看之下,就是多了一些判断稀疏矩阵之类的条件性代码。
#coding=utf-8
import warnings

import numpy as np
from scipy import sparse
def _handle_zeros_in_scale(scale, copy=True):
    """Replace zero scale factors with 1 so that dividing by them is safe.

    A zero scale shows up when a feature is constant; dividing by 1 instead
    leaves such a feature unchanged rather than producing inf/NaN.

    Parameters
    ----------
    scale : scalar or numpy.ndarray
        Scale factor(s) to sanitize.
    copy : bool, default True
        Only used for array input. If True the zeros are replaced in a copy
        and the caller's array is left untouched; if False the array is
        modified in place.

    Returns
    -------
    scalar or numpy.ndarray
        The sanitized scale. Input that is neither a scalar nor an ndarray
        falls through both branches and yields ``None``.
    """
    # If we are fitting on 1D arrays, scale might be a scalar.
    if np.isscalar(scale):
        if scale == 0.0:
            scale = 1.0
        return scale
    elif isinstance(scale, np.ndarray):
        if copy:
            # New array to avoid side effects on the caller's data.
            scale = scale.copy()
        scale[scale == 0.0] = 1.0
        return scale
def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
    """Standardize a dataset along any axis

    Center to the mean and component wise scale to unit variance.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The data to center and scale.

    axis : int (0 by default)
        axis used to compute the means and standard deviations along. If 0,
        independently standardize each feature, otherwise (if 1) standardize
        each sample.

    with_mean : boolean, True by default
        If True, center the data before scaling.

    with_std : boolean, True by default
        If True, scale the data to unit variance (or equivalently,
        unit standard deviation).

    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSC matrix and if axis is 1).

    Returns
    -------
    X : the validated (and possibly copied) input, standardized in place.

    Raises
    ------
    ValueError
        If X is sparse and ``with_mean`` is True, or if X is sparse and
        ``axis`` is not 0.

    Notes
    -----
    This implementation will refuse to center scipy.sparse matrices
    since it would make them non-sparse and would potentially crash the
    program with memory exhaustion problems.

    Instead the caller is expected to either set explicitly
    `with_mean=False` (in that case, only variance scaling will be
    performed on the features of the CSC matrix) or to call `X.toarray()`
    if he/she expects the materialized dense array to fit in memory.

    To avoid memory copy the caller should pass a CSC matrix.

    See also
    --------
    StandardScaler: Performs scaling to unit variance using the``Transformer`` API
        (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).

    """  # noqa
    # NOTE(review): check_array, FLOAT_DTYPES, mean_variance_axis and
    # inplace_column_scale are not defined or imported in this file;
    # presumably they come from scikit-learn's utilities -- confirm.
    X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False,
                    warn_on_dtype=True, estimator='the scale function',
                    dtype=FLOAT_DTYPES)
    if sparse.issparse(X):
        if with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` instead"
                " See docstring for motivation and alternatives.")
        if axis != 0:
            raise ValueError("Can only scale sparse matrix on axis=0, "
                             " got axis=%d" % axis)
        if with_std:
            _, var = mean_variance_axis(X, axis=0)
            var = _handle_zeros_in_scale(var, copy=False)
            inplace_column_scale(X, 1 / np.sqrt(var))
    else:
        X = np.asarray(X)
        if with_mean:
            mean_ = np.mean(X, axis)
        if with_std:
            scale_ = np.std(X, axis)
        # Xr is a view on the original array broadcasting on the axis in
        # which we are interested in.
        # np.rollaxis returns a view of its input, so the in-place updates
        # applied to Xr below are written into X itself -- which is why the
        # function operates on Xr throughout yet returns X at the end. The
        # two differ only in layout, not in content (for axis=0 this can be
        # checked with ``assert (X == Xr).all()``).
        Xr = np.rollaxis(X, axis)
        if with_mean:
            Xr -= mean_
            mean_1 = Xr.mean(axis=0)
            # Verify that mean_1 is 'close to zero'. If X contains very
            # large values, mean_1 can also be very large, due to a lack of
            # precision of mean_. In this case, a pre-scaling of the
            # concerned feature is efficient, for instance by its mean or
            # maximum.
            if not np.allclose(mean_1, 0):
                warnings.warn("Numerical issues were encountered "
                              "when centering the data "
                              "and might not be solved. Dataset may "
                              "contain too large values. You may need "
                              "to prescale your features.")
                Xr -= mean_1
        if with_std:
            scale_ = _handle_zeros_in_scale(scale_, copy=False)
            Xr /= scale_
            if with_mean:
                mean_2 = Xr.mean(axis=0)
                # If mean_2 is not 'close to zero', it comes from the fact
                # that scale_ is very small so that mean_2 = mean_1/scale_
                # > 0, even if mean_1 was close to zero. The problem is thus
                # essentially due to the lack of precision of mean_. A
                # solution is then to subtract the mean again:
                if not np.allclose(mean_2, 0):
                    warnings.warn("Numerical issues were encountered "
                                  "when scaling the data "
                                  "and might not be solved. The standard "
                                  "deviation of the data is probably "
                                  "very close to 0. ")
                    Xr -= mean_2
    return X
简化版scale代码
def scale_mean_var(input_arr, axis=0):
    """Standardize an array to zero mean and unit variance along ``axis``.

    Simplified, dense-only counterpart of ``sklearn.preprocessing.scale``:
    subtract the mean, divide by the standard deviation, and re-center
    after each step if rounding left a residual mean.

    Parameters
    ----------
    input_arr : array-like
        Data to standardize. It is converted with ``np.asarray`` and is not
        modified.
    axis : int or None, default 0
        Axis along which the mean and standard deviation are computed.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``input_arr``. Slices whose
        standard deviation is 0 (constant features) are only centered.
    """
    data = np.asarray(input_arr)
    # keepdims=True keeps the statistics broadcastable against `data` for
    # any axis, and guarantees they are arrays (not scalars) for 1-D input.
    mean_ = np.mean(data, axis=axis, keepdims=True)
    scale_ = np.std(data, axis=axis, keepdims=True)

    # Subtract the mean (allocates a new array; the input stays untouched).
    output_arr = data - mean_

    # Check whether the residual mean is close to 0; re-center if not.
    mean_1 = output_arr.mean(axis=axis, keepdims=True)
    if not np.allclose(mean_1, 0):
        output_arr -= mean_1

    # Replace zero standard deviations with 1 to avoid dividing by zero.
    scale_[scale_ == 0.0] = 1.0

    # Divide by the standard deviation.
    output_arr /= scale_

    # Check the mean once more after scaling; re-center if needed.
    mean_2 = output_arr.mean(axis=axis, keepdims=True)
    if not np.allclose(mean_2, 0):
        output_arr -= mean_2
    return output_arr
最大最小归一化
sklearn.preprocessing.minmax_scale函数:Transforms features by scaling each feature to a given range.
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min
简化版代码很简单
def max_min(input_arr, o_min, o_max):
    """Linearly rescale an array into the range ``[o_min, o_max]``.

    The minimum and maximum are taken over the whole array (not per
    feature), so the smallest element maps to ``o_min`` and the largest to
    ``o_max``.

    Parameters
    ----------
    input_arr : array-like
        Data to rescale; it is not modified.
    o_min : number
        Lower bound of the output range.
    o_max : number
        Upper bound of the output range.

    Returns
    -------
    numpy.ndarray
        Rescaled copy of the input. If every element is equal, all outputs
        are ``o_min`` instead of NaN.
    """
    data = np.asarray(input_arr)
    i_min = np.min(data)
    i_max = np.max(data)

    # A constant input has zero range; divide by 1 instead so the result is
    # a well-defined 0 (-> o_min) rather than 0/0 = NaN.
    span = i_max - i_min
    if span == 0:
        span = 1

    out_arr = (data - i_min) / span
    return out_arr * (o_max - o_min) + o_min
最大绝对值归一化
maxabs_scale函数:Scale each feature by its maximum absolute value.
def maxabs_scale(input, axis=0):
    """Scale data into the [-1, 1] range by its maximum absolute value.

    Each slice along ``axis`` is divided by the largest absolute value found
    in it; the data is not shifted or centered.

    Parameters
    ----------
    input : array-like
        Data to scale. Anything that is not already a ``numpy.ndarray`` is
        converted to a float32 array; ndarrays are used as given. The
        caller's data is not modified.
    axis : int or None, default 0
        Axis along which the maximum absolute value is computed.

    Returns
    -------
    numpy.ndarray
        Scaled array with the same shape as the input. Slices that are
        entirely zero are returned unchanged instead of producing NaN.
    """
    # The parameter name shadows the builtin `input`; it is kept because it
    # is part of the public signature.
    if not isinstance(input, np.ndarray):
        input = np.asarray(input).astype(np.float32)

    # keepdims=True keeps the divisor broadcastable for any axis.
    maxabs = np.max(np.abs(input), axis=axis, keepdims=True)
    # An all-zero slice has maxabs == 0; divide by 1 to avoid 0/0.
    maxabs[maxabs == 0] = 1

    out_array = input / maxabs
    return out_array