Python DataFrame 多条件筛选:pandas DataFrame 多条件过滤
一、多条件过滤
使⽤query⽅法
df_filtered = df.query('a == 4 & b != 2')
注意:等值过滤要写两个等号(==);
使⽤==
data[(data['A']==0)&(data['B']==1)]
使⽤loc函数
>>> data.loc[(data['A']==0)&(data['B']==1)] # 提取data数据(多个筛选条件)
A B C D
a 0 1 2 3
⼆、范围过滤
// query函数
// query <
rpt.query('60000 < STK_ID < 70000')
// query in
rpt.query('STK_ID in (600809,600141,600329)')
// isin函数
// 筛选出dataframe中某一列的值为某一个或某几个字符串的行:
list=['key1','key2']
df = df[df['one'].isin(list)]
// data[(data['A'].isin([0]))&(data['B'].isin([1]))] # isin函数
// 筛选出dataframe中某一列的值不为某一个或某几个字符串的行,相当于反选
df = df[~df['one'].isin(list)]
三、有级联关系的过滤,⽐如20201101有两个advertir_id(adv1044525491840、adv1049003362112),20201102有⼀个(adv1049003362112),直接通过not in &实现不了,如下
// 预期结果如下
advertir_id day id
0 adv1044525491840 20201101 1
1 adv1049003362112 20201101 2
>>> import pandas as pd
>>> data1 = {'id':[1,2,3],'day':[20201101,20201101,20201102],'advertir_id':
['adv1044525491840','adv1049003362112','adv1049003362112']}
>>> patchDF = pd.DataFrame(data1)
>>> data2 = {'day':[20201102],'advertir_id':['adv1049003362112']}
>>> advertirDF = pd.DataFrame(data2)
>>> adDF = patchDF.query("day not in (%s) & advertir_id not in (%s)"%
(advertirDF['day'].tolist(),advertirDF['advertir_id'].tolist()))
>>> adDF
advertir_id day id
0 adv1044525491840 20201101 1
// 实际返回如上
() not in ((),())写法,这种写法不⽀持,如下
>>> import pandas as pd
>>> data1 = {'id':[1,2,3],'day':[20201101,20201101,20201102],'advertir_id':
['adv1044525491840','adv1049003362112','adv1049003362112']}
>>> patchDF = pd.DataFrame(data1)
>>> data2 = {'day':[20201102],'advertir_id':['adv1049003362112']}
>>> advertirDF = pd.DataFrame(data2)
>>> adDF = patchDF.query("(day,advertir_id) not in ((20201102,'adv1049003362112'))")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py", line 2850, in query
    new_data = self.loc[res]
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 1478, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 1912, in _getitem_axis
    return self._get_label(key, axis=axis)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexing.py", line 140, in _get_label
    return self.obj._xs(label, axis=axis)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/generic.py", line 2987, in xs
    loc = self.index.get_loc(key)
  File "/usr/local/lib/python2.7/dist-packages/pandas/core/indexes/base.py", line 3080, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 159, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index_class_helper.pxi", line 120, in pandas._libs.index.Int64Engine._check_type
KeyError: True
left join⽅式
>>> import pandas as pd
>>> import numpy as np
>>> data1 = {'id':[1,2,3],'day':[20201101,20201101,20201102],'advertir_id':
['adv1044525491840','adv1049003362112','adv1049003362112']}
>>> patchDF = pd.DataFrame(data1)
>>> data2 = {'day':[20201102],'advertir_id':['adv1049003362112'],'id':[1]}
>>> advertirDF = pd.DataFrame(data2)
>>> mergeDF = pd.merge(patchDF, advertirDF, how='left', on=['day', 'advertir_id'],suffixes=('_patch', '_advertir'))
>>> adDF = mergeDF[np.isnan(mergeDF['id_advertir'])]
>>> adDF
advertir_id day id_patch id_advertir
0 adv1044525491840 20201101 1 NaN
1 adv1049003362112 20201101 2 NaN
// 求非nan,大数据量可能报错(参考:itdiandi/view/2874)
// >>> adDF = mergeDF[~np.isnan(mergeDF['id_advertir'])]
// >>> adDF
// advertir_id day id_patch id_advertir
// 2 adv1049003362112 20201102 3 1.0
// 使⽤pd.notna判断
adDF = mergeDF[pd.notna(mergeDF['id_advertir'])]