NLP实战-商品信息可视化与文本分析

更新时间:2023-05-16 02:49:09 阅读: 评论:0

NLP实战-商品信息可视化与文本分析
文章目录
商品信息可视化与文本处理结果可视化展示
启动 notebook
jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle
#import lda
import matplotlib.pyplot as plt
import aborn as sns
sns.t()
from nltk.stem.porter import*
kenize import word_tokenize, nt_tokenize
pus import stopwords
from sklearn.feature_extraction import stop_words
from collections import Counter
from wordcloud import WordCloud
from sklearn. import TfidfVectorizer
from sklearn. import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.offline as py  # plotly 画图和交互
py.init_notebook_mode(connected=True)
aph_objs as go
ls as tls
%matplotlib inline
import bokeh.plotting as bp
dels import HoverTool, BoxSelectTool
dels import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
#ansform import factor_cmap
import warnings
warnings.filterwarnings('ignore')
import logging
# Load the tab-separated training and test files.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')
# size of the training and test datasets
print(train.shape)  # (1482535, 8)
print(test.shape)  # (693359, 7)
# different data types in the dataset: categorical (strings) and numeric
train.dtypes
'''
train_id              int64
name                  object
item_condition_id      int64
category_name        object
brand_name            object
price                float64
shipping              int64
item_description      object
dtype: object
'''
train.head()
   train_id  name                                 item_condition_id  category_name                                      brand_name  price  shipping  item_description
0  0         MLB Cincinnati Reds T Shirt Size XL  3                  Men/Tops/T-shirts                                  NaN         10.0   1         No description yet
1  1         Razer BlackWidow Chroma Keyboard     3                  Electronics/Computers & Tablets/Components & P...  Razer       52.0   0         This keyboard is in great condition and works ...
2  2         AVA-VIV Blouse                       1                  Women/Tops & Blouses/Blouse                        Target      10.0   1         Adorable top with a hint of lace and a
3  3         Leather Horse Statues                1                  Home/Home Décor/Home Décor Accents                 NaN         35.0   1         New with tags. Leather horses. Retail for [rm]...
4  4         24K GOLD plated rose                 1                  Women/Jewelry/Necklaces                            NaN         44.0   0         Complete with certificate of authenticity
train_id name item_condition_id category_name brand_name price shipping item_description
对我们将要提供的建议价格进行处理,使用log变换
train.price.describe()
count    1.482535e+06
mean    2.673752e+01
std      3.858607e+01
min      0.000000e+00
25%      1.000000e+01
50%      1.700000e+01
75%      2.900000e+01
max      2.009000e+03
Name: price, dtype: float64
价格属性转换前和转换后的分布情况对比
# Side-by-side histograms: raw price (left, limited to [0, 250]) versus
# log(price + 1) (right).
hist_kwargs = dict(bins=50, figsize=(20, 10), edgecolor='white')
panels = [
    # (subplot position, series to plot, extra hist kwargs, x label, title)
    (1, train['price'], dict(range=[0, 250]), 'price+',
     'Price Distribution - Training Set'),
    (2, np.log(train['price'] + 1), dict(), 'log(price+1)',
     'Log(Price) Distribution - Training Set'),
]
for position, series, extra, x_label, heading in panels:
    plt.subplot(1, 2, position)
    series.plot.hist(**hist_kwargs, **extra)
    plt.xlabel(x_label, fontsize=17)
    plt.ylabel('frequency', fontsize=17)
    plt.tick_params(labelsize=15)
    plt.title(heading, fontsize=17)
plt.show()
运费承担:大约55%的商品运费由买家承担(shipping=0),约45%由卖家承担(shipping=1)
train.shipping.value_counts()/len(train)
0    0.552726
1    0.447274
Name: shipping, dtype: float64
运费不同情况的价格变化
# Compare the log(price + 1) distributions of the two shipping groups
# (this code labels shipping == 1 as seller-paid and shipping == 0 as buyer-paid).
prc_shipBySeller = train.loc[train.shipping==1, 'price']
prc_shipByBuyer = train.loc[train.shipping==0, 'price']
fig, ax = plt.subplots(figsize=(20,10))
ax.hist(np.log(prc_shipBySeller+1), color='#8CB4E1', alpha=1.0, bins=50,
        label='Price when Seller pays Shipping')
ax.hist(np.log(prc_shipByBuyer+1), color='#007D00', alpha=0.7, bins=50,
        label='Price when Buyer pays Shipping')
# The title and y label set here are replaced by the plt.* calls below.
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
plt.legend()
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.title('Price Distribution by Shipping Type', fontsize=17)
plt.tick_params(labelsize=15)
plt.show()
用户自己付费的平均价格要低于商家包邮的
商品类别
print("There are %d unique values in the category column."% train['category_name'].nunique())
There are 1287 unique values in the category column.
# TOP 5 RAW CATEGORIES
train['category_name'].value_counts()[:5]
Women/Athletic Apparel/Pants, Tights, Leggings    60177
Women/Tops & Blouses/T-Shirts                    46380
Beauty/Makeup/Face                                34335
Beauty/Makeup/Lips                                29910
Electronics/Video Games & Consoles/Games          26557
Name: category_name, dtype: int64
# missing categories
print("There are %d items that do not have a label."% train['category_name'].isnull().sum())
There are 6327 items that do not have a label.
类别细分
def split_cat(text):
    """Split a "/"-delimited category path into its levels.

    Args:
        text: Category string such as "Men/Tops/T-shirts". Values that
            cannot be split this way (e.g. NaN or None for a missing
            category) are accepted.

    Returns:
        The list produced by ``text.split("/")`` (which may hold more or
        fewer than three items), or the tuple
        ``("No Label", "No Label", "No Label")`` when ``text`` cannot be
        split as a string.
    """
    try:
        return text.split("/")
    except (AttributeError, TypeError):
        # Non-string values (float NaN, None, ...) have no usable .split("/").
        return ("No Label", "No Label", "No Label")
# Transpose the per-row split results into three parallel columns.
train_split = zip(*train['category_name'].apply(split_cat))
train['general_cat'], train['subcat_1'], train['subcat_2'] = train_split
train.head()
train_id name item_condition_id category_name brand_name price shipping item_description general_cat subcat_1subcat_2
00
MLB
Cincinnati
Reds T
Shirt Size
XL
3Men/Tops/T-shirts NaN10.01No description yet Men Tops T-shirts
11
Razer
BlackWidow
3
Electronics/Computers &
Razer52.00
This keyboard is
in great condition Electronics
Computers Components
Chroma
Keyboard
Tablets/Components & P...and works ...& Tablets& Parts
22AVA-VIV
Blouse
1
Women/Tops &
Blouses/Blouse
Target10.01
Adorable top with
a hint of lace and
a
Women
Tops &
Blouses
Blouse
33Leather
Horse
Statues
1
Home/Home Décor/Home
Décor Accents
NaN35.01
New with tags.
Leather horses.
Retail for [rm]...
Home
Home
Décor
Home Décor
Accents
4424K GOLD
plated rose
1Women/Jewelry/Necklaces NaN44.00
Complete with
certificate of
authenticity
Women Jewelry Necklaces
train_id name item_condition_id category_name brand_name price shipping item_description general_cat subcat_1subcat_2
# Apply the same category split to the test set.
test_split = zip(*test['category_name'].apply(split_cat))
test['general_cat'], test['subcat_1'], test['subcat_2'] = test_split
print("There are %d unique first sub-categories."% train['subcat_1'].nunique())
There are 114 unique first sub-categories.
# Number of distinct second-level sub-categories in the training set.
print("There are %d unique second sub-categories." % train['subcat_2'].nunique())
There are 871 unique second sub-categories.
总的来说,我们有7个主要类别(第一个子类别中的114个和第二个子类别中的871个):女性和美容项目是最受欢迎的两类(超过50%的观察),其次是儿童和电子产品。
各大主类别分布情况
# Bar chart of item counts per top-level category; each bar's hover text is
# that category's share of all training rows.
general_counts = train['general_cat'].value_counts()
x = general_counts.index.values.astype('str')
y = general_counts.values
pct = ["%.2f" % (share * 100) + "%" for share in y / len(train)]
trace1 = go.Bar(x=x, y=y, text=pct)
layout = {
    'title': 'Number of Items by Main Category',
    'yaxis': {'title': 'Count'},
    'xaxis': {'title': 'Category'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)
subcat_1类别分布情况
# Bar chart of the 15 most frequent first-level sub-categories, coloured by
# count; hover text is each sub-category's share of all training rows.
subcat_counts = train['subcat_1'].value_counts()
x = subcat_counts.index.values.astype('str')[:15]
y = subcat_counts.values[:15]
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))]
trace1 = go.Bar(x=x, y=y, text=pct,
                marker=dict(
                    color=y, colorscale='Portland', showscale=True,
                    reversescale=False
                ))
layout = dict(title='Number of Items by Sub Category (Top 15)',
              yaxis=dict(title='Count'),
              xaxis=dict(title='SubCategory'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
# Note on the sub-category chart above: athletic apparel, makeup, women's blouses.
general_cats = train['general_cat'].unique()
x = [train.loc[train['general_cat']==cat, 'price'] for cat in general_cats]
# One box per top-level category. The values are passed as `x`, so the price
# scale runs along the x axis and the category names along the y axis.
data = [go.Box(x=np.log(prices+1), name=cat) for cat, prices in zip(general_cats, x)]
layout = dict(title="Price Distribution by General Category",
              yaxis=dict(title='Category'),
              xaxis=dict(title='log(price+1)'))
fig = dict(data=data, layout=layout)
py.iplot(fig)
品牌名字
# Number of distinct brand names in the training set.
print("There are %d unique brand names in the training dataset." % train['brand_name'].nunique())
There are 4809 unique brand names in the training dataset.
# Bar chart of the 10 brands with the most listings, coloured by count.
brand_counts = train['brand_name'].value_counts()
x = brand_counts.index.values.astype('str')[:10]
y = brand_counts.values[:10]
trace1 = go.Bar(x=x, y=y,
                marker=dict(
                    color=y, colorscale='Portland', showscale=True,
                    reversescale=False
                ))
# Brand names are plotted on x and counts on y, so the axis titles follow that.
layout = dict(title='Top 10 Brand by Number of Items',
              yaxis=dict(title='Count'),
              xaxis=dict(title='Brand Name'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)
商品描述
由于它是非结构化数据,因此解析这个特定项目会更具挑战性。 这是否意味着更详细和更长的描述会导致更高的出价? 我们将删除所有标点,删除一些英语停用词(即冗余
词,如“a”,“the”等)以及长度不超过3的任何其他词:
# Matches punctuation, digits and CR/TAB/LF. Compiled once because wordCount
# is applied to every row of the description column.
_NON_WORD_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def wordCount(text):
    """Count the informative words in a description.

    The text is lower-cased, punctuation/digits/control whitespace are
    replaced by spaces, and the result is split on single spaces. A token is
    counted only if it is longer than three characters and is not in
    ``stop_words.ENGLISH_STOP_WORDS``.

    Args:
        text: Description string. Non-string values (e.g. NaN or None for a
            missing description) are accepted.

    Returns:
        int: Number of tokens that pass the filter; 0 when ``text`` is not a
        string.
    """
    try:
        lowered = text.lower()
    except AttributeError:
        # Non-string input (NaN, None, numbers) has no .lower().
        return 0
    cleaned = _NON_WORD_RE.sub(" ", lowered)
    # Length check first: it is cheaper than the stop-word lookup and also
    # discards the empty strings produced by consecutive spaces.
    words = [w for w in cleaned.split(" ")
             if len(w) > 3 and w not in stop_words.ENGLISH_STOP_WORDS]
    return len(words)
# Attach the filtered word count of each description to both data sets.
for frame in (train, test):
    frame['desc_len'] = frame['item_description'].apply(wordCount)
train.head()
train_id name item_condition_id category_name brand_name price shipping item_description general_cat subcat_1subcat_2desc_len
00
MLB
Cincinnati
Reds T
Shirt Size
XL
3Men/Tops/T-shirts NaN10.01No description yet Men Tops T-shirts
11
Razer
BlackWidow
Chroma
Keyboard
3
Electronics/Computers &
Tablets/Components & P...
Razer52.00
This keyboard is
in great condition
and works ...
Electronics
Computers
& Tablets
Components
& Parts
22AVA-VIV
Blouse
1
Women/Tops &
Blouses/Blouse
Target10.01
Adorable top with
a hint of lace and
a
Women
Tops &
Blouses
Blouse
33Leather
Horse
Statues
1
Home/Home Décor/Home
Décor Accents
NaN35.01
New with tags.
Leather horses.
Retail for [rm]...
Home
Home
Décor
Home Décor
Accents
4424K GOLD
plated rose
1Women/Jewelry/Necklaces NaN44.00
Complete with
certificate of
authenticity
Women Jewelry Necklaces
# Mean price for each description word count, as a two-column frame
# (desc_len, price).
df = train.groupby('desc_len')['price'].mean().reset_index()
描述长短与价格有关吗
# Line-and-marker chart of log(mean price + 1) against description length.
trace1 = go.Scatter(x=df['desc_len'], y=np.log(df['price'] + 1),
                    mode='lines+markers', name='lines+markers')
layout = {
    'title': 'Average Log(Price) by Description Length',
    'yaxis': {'title': 'Average Log(Price)'},
    'xaxis': {'title': 'Description Length'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)
3.4671136297685132, 3.454598158389271, 3.4523128824675857, 3.740839193044647, 3.264486336120253, 3.3081069585961433,
2.909932283750658,
3.1179499062782403, 2.833213344056216, 2.515678308454754, 3.2217288938506075, 2.8526314299133175,
2.8716796248840124,
3.713572066704308, 2.7343675094195836, 3.056356895370426, 3.2121868367174042, 2.8791984572980396,
2.9444389791664403], “mode”: “lines+markers”, “name”: “lines+markers”}], {“title”: “Average Log(Price) by Description Length”, “yaxis”: {“title”: “Average Log(Price)”}, “xaxis”: {“title”: “Description Length”}}, {“showLink”: true, “linkText”: “Export to plot.ly”})});

本文发布于:2023-05-16 02:49:09,感谢您对本站的认可!

本文链接:https://www.wtabcd.cn/fanwen/fan/82/648612.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:价格   情况   可视化   类别   转换   超过   运费
相关文章
留言与评论(共有 0 条评论)
   
验证码:
推荐文章
排行榜
Copyright ©2019-2022 Comsenz Inc.Powered by © 专利检索| 网站地图