引言:电影市场的数据迷雾
在当今的数字时代,热映电影的成功不仅仅取决于导演的才华或演员的魅力,更取决于观众的真实反馈和市场数据的精准解读。每部电影上映后,都会在各大平台产生海量的评论数据,这些数据就像一座金矿,蕴含着观众的真实喜好、情感倾向以及票房表现的深层逻辑。
本文将深入探讨如何通过数据分析技术,从影评中挖掘观众的真实反馈,并揭示这些反馈与票房之间的神秘联系。我们将使用Python作为主要工具,结合自然语言处理(NLP)和机器学习技术,构建一个完整的影评分析系统。
1. 数据收集:构建影评数据库
1.1 数据来源选择
要进行深入的影评分析,首先需要构建一个全面的数据库。主要的数据来源包括:
- 豆瓣电影:中文电影评论的权威平台,包含详细的评分和长评
- 猫眼专业版:提供实时票房数据和用户短评
- IMDb:国际电影数据库,适合分析外语电影
- 微博/小红书:社交媒体上的实时讨论和话题热度
1.2 数据爬取实现
以下是一个完整的Python爬虫示例,用于从豆瓣电影获取影评数据:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
import random
from fake_useragent import UserAgent
class DoubanMovieScraper:
    """Scrape review-list pages for a movie from movie.douban.com.

    Keeps a persistent requests.Session with browser-like headers and sends
    a fresh random User-Agent (fake_useragent) on every request to reduce
    the chance of being blocked.
    """

    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()
        # Browser-like default headers; the User-Agent itself is randomized
        # per request in get_movie_reviews().
        self.session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

    def get_movie_reviews(self, movie_id, pages=10):
        """Fetch and parse review list pages for one movie.

        :param movie_id: numeric movie id taken from the Douban URL
        :param pages: number of list pages to fetch (Douban paginates 20 per page)
        :return: pandas.DataFrame with one row per successfully parsed review
        """
        reviews = []
        for page in range(pages):
            try:
                # Douban review-list URL template (20 reviews per page).
                url = f"https://movie.douban.com/subject/{movie_id}/reviews?start={page*20}&limit=20"
                # Fresh random User-Agent for every request.
                headers = {'User-Agent': self.ua.random}
                response = self.session.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                # One div.review-item per review on the list page.
                review_items = soup.find_all('div', class_='review-item')
                for item in review_items:
                    try:
                        # Rating: assumes the element's 2nd CSS class encodes the
                        # score after a dash (e.g. 'allstar40-...') — TODO(review):
                        # confirm against current Douban markup.
                        rating_elem = item.find('span', class_='main-title-rating')
                        rating = rating_elem['class'][1] if rating_elem else None
                        rating = int(rating.split('-')[1]) if rating else 0
                        # Review title
                        title_elem = item.find('a', class_='title')
                        title = title_elem.text.strip() if title_elem else ""
                        # Author display name
                        author_elem = item.find('a', class_='name')
                        author = author_elem.text.strip() if author_elem else ""
                        # Review body (short excerpt shown on the list page)
                        content_elem = item.find('div', class_='short-content')
                        content = content_elem.text.strip() if content_elem else ""
                        # "Useful" vote count
                        useful_elem = item.find('span', class_='votes')
                        useful = int(useful_elem.text) if useful_elem else 0
                        # Raw review timestamp text (parsed later by DataCleaner)
                        time_elem = item.find('span', class_='time')
                        review_time = time_elem.text.strip() if time_elem else ""
                        reviews.append({
                            'rating': rating,
                            'title': title,
                            'author': author,
                            'content': content,
                            'useful': useful,
                            'review_time': review_time,
                            'movie_id': movie_id
                        })
                    except Exception as e:
                        # One malformed item should not abort the whole page.
                        print(f"解析单条评论出错: {e}")
                        continue
                print(f"已爬取第 {page+1} 页数据,共 {len(review_items)} 条评论")
                # Random delay between pages to avoid IP bans.
                time.sleep(random.uniform(2, 5))
            except Exception as e:
                # Network/HTTP failure: stop paging, keep what we have so far.
                print(f"爬取第 {page+1} 页时出错: {e}")
                break
        return pd.DataFrame(reviews)
# Usage example
if __name__ == "__main__":
    scraper = DoubanMovieScraper()
    # "The Wandering Earth 2" (Douban movie ID: 35204412)
    df_reviews = scraper.get_movie_reviews("35204412", pages=5)
    # Persist raw data; utf-8-sig adds a BOM so Excel renders Chinese correctly.
    df_reviews.to_csv('流浪地球2_影评.csv', index=False, encoding='utf-8-sig')
    print(f"共获取 {len(df_reviews)} 条影评数据")
1.3 数据清洗与预处理
获取原始数据后,需要进行清洗和预处理:
import pandas as pd
import re
import jieba
from datetime import datetime
class DataCleaner:
    """Clean, tokenize and date-parse raw review data for downstream analysis."""

    def __init__(self):
        # Load the stop-word list; fall back to a small built-in set when the
        # file is missing so the cleaner works out of the box.
        self.stopwords = set()
        try:
            with open('chinese_stopwords.txt', 'r', encoding='utf-8') as f:
                # Iterate the file directly instead of materializing readlines().
                self.stopwords = {line.strip() for line in f}
        except FileNotFoundError:
            # Default stop words
            self.stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己'}

    def clean_text(self, text):
        """Return *text* stripped of HTML tags and non-CJK/ASCII noise.

        Non-string input (NaN, None) yields the empty string.
        """
        if not isinstance(text, str):
            return ""
        # Drop HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        # Keep CJK ideographs, ASCII letters/digits, CJK punctuation and
        # full-width forms; everything else becomes a space.
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\u3000-\u303f\uff01-\uff5e]', ' ', text)
        # Collapse runs of whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text):
        """Segment Chinese text with jieba, dropping stop words and single chars."""
        if not text:
            return []
        words = jieba.lcut(text)
        return [w for w in words if w not in self.stopwords and len(w) > 1]

    def process_reviews(self, df):
        """Clean a whole reviews DataFrame and return a new, filtered frame.

        Adds 'content_clean', 'tokens', 'content_length' and 'review_date'
        columns, then drops reviews whose cleaned text is too short.
        """
        # Work on a copy so the caller's frame is not mutated and pandas does
        # not warn about chained assignment on the filtered view.
        df = df.copy()
        df['content_clean'] = df['content'].apply(self.clean_text)
        df['tokens'] = df['content_clean'].apply(self.tokenize)
        df['content_length'] = df['content_clean'].apply(len)

        def parse_review_time(time_str):
            """Parse '2024-01-15' or '2024年1月15日'; None when unparseable."""
            try:
                if '-' in time_str:
                    return datetime.strptime(time_str, '%Y-%m-%d')
                elif '年' in time_str:
                    return datetime.strptime(time_str, '%Y年%m月%d日')
                return None
            except (ValueError, TypeError):
                # Narrowed from a bare except: only parse failures are expected.
                return None

        df['review_date'] = df['review_time'].apply(parse_review_time)
        # Drop reviews that are too short to analyse.
        df = df[df['content_length'] > 10]
        return df
# Usage example
cleaner = DataCleaner()
df_cleaned = cleaner.process_reviews(df_reviews)  # df_reviews from the scraping step
print(f"清洗后剩余 {len(df_cleaned)} 条有效评论")
2. 情感分析:量化观众情绪
2.1 情感分析原理
情感分析是将文本分类为正面、负面或中性情绪的过程。在影评分析中,我们可以使用预训练的中文情感分析模型,如SnowNLP或BERT模型。
2.2 基于SnowNLP的情感分析
from snownlp import SnowNLP
import matplotlib.pyplot as plt
import seaborn as sns
class SentimentAnalyzer:
    """Score review sentiment with SnowNLP and visualise the distribution.

    (The unused ``sentiment_scores`` instance list from the original version
    was removed — no method ever read or wrote it.)
    """

    def analyze_sentiment(self, text):
        """Return a sentiment score in [0, 1] (1 = most positive).

        Empty or very short text gets the neutral score 0.5, as does any
        text SnowNLP fails to process.
        """
        if not text or len(text) < 5:
            return 0.5
        try:
            s = SnowNLP(text)
            # SnowNLP.sentiments is a probability in [0, 1]; closer to 1
            # means more positive.
            return s.sentiments
        except Exception as e:
            # SnowNLP can fail on unusual input; treat as neutral.
            print(f"情感分析出错: {e}")
            return 0.5

    def batch_analyze(self, df):
        """Add 'sentiment' and 'sentiment_label' columns to *df* and return it."""
        df['sentiment'] = df['content_clean'].apply(self.analyze_sentiment)

        def classify_sentiment(score):
            # Scores in (0.4, 0.6) are treated as a neutral band.
            if score >= 0.6:
                return 'positive'
            elif score <= 0.4:
                return 'negative'
            return 'neutral'

        df['sentiment_label'] = df['sentiment'].apply(classify_sentiment)
        return df

    def visualize_sentiment(self, df):
        """Plot the score histogram and the label pie chart side by side."""
        plt.figure(figsize=(12, 5))
        # Left: sentiment score distribution
        plt.subplot(1, 2, 1)
        sns.histplot(df['sentiment'], bins=20, kde=True, color='skyblue')
        plt.title('情感分数分布')
        plt.xlabel('情感分数 (0-1)')
        plt.ylabel('频数')
        # Right: sentiment label share
        plt.subplot(1, 2, 2)
        sentiment_counts = df['sentiment_label'].value_counts()
        plt.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%',
                colors=['#ff9999', '#66b3ff', '#99ff99'])
        plt.title('情感类别分布')
        plt.tight_layout()
        plt.show()
# Usage example
analyzer = SentimentAnalyzer()
df_analyzed = analyzer.batch_analyze(df_cleaned)  # df_cleaned from the cleaning step
analyzer.visualize_sentiment(df_analyzed)
2.3 基于BERT的深度情感分析
对于更精确的情感分析,可以使用BERT模型:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
class BERTSentimentAnalyzer:
    """Binary sentiment scorer built on a BERT sequence classifier.

    NOTE(review): loading 'bert-base-chinese' attaches a freshly initialised
    2-label classification head — without fine-tuning the scores are
    essentially random; point *model_path* at a fine-tuned checkpoint for
    real use.
    """

    def __init__(self, model_path='bert-base-chinese'):
        """Load tokenizer + model and move the model to GPU when available."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
        self.model.to(self.device)
        self.model.eval()

    def predict_sentiment(self, text, max_length=128):
        """Return P(positive) for one text; 0.5 for empty input."""
        if not text:
            return 0.5
        # Encode to fixed-length tensors (pad/truncate to max_length).
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        # Inference only — no gradient tracking.
        with torch.no_grad():
            outputs = self.model(**inputs)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            # Label index 1 is taken as the "positive" class.
            positive_prob = probabilities[0][1].item()
        return positive_prob

    def batch_predict(self, texts, batch_size=32):
        """Score *texts* in chunks of *batch_size*, printing progress.

        NOTE(review): items are still scored one at a time inside each
        chunk; true batched tokenize+forward would be much faster on GPU.
        """
        results = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            batch_results = [self.predict_sentiment(text) for text in batch_texts]
            results.extend(batch_results)
            print(f"已处理 {i+len(batch_results)}/{len(texts)} 条")
        return results
# Sentiment analysis with the pretrained BERT model
def bert_sentiment_analysis(df):
    """Score the first 100 reviews with BERT and write results back to *df*.

    Adds 'bert_sentiment' (positive probability) and 'bert_label' columns
    for the sampled rows; other rows are left as NaN.
    """
    analyzer = BERTSentimentAnalyzer()
    # Demo on the first 100 rows (the full frame works the same way).
    # FIX: use the actual index labels of those rows. After cleaning, the
    # index is no longer a contiguous 0..n range, so df.loc[:99] would
    # select by *label* and mis-align (or mismatch in length) with head(100).
    sample_idx = df.index[:100]
    sample_texts = df.loc[sample_idx, 'content_clean'].tolist()
    sentiments = analyzer.batch_predict(sample_texts)
    df.loc[sample_idx, 'bert_sentiment'] = sentiments
    df.loc[sample_idx, 'bert_label'] = df.loc[sample_idx, 'bert_sentiment'].apply(
        lambda x: 'positive' if x > 0.6 else 'negative' if x < 0.4 else 'neutral'
    )
    return df
3. 主题建模:发现观众关注点
3.1 LDA主题模型原理
LDA(Latent Dirichlet Allocation)是一种无监督学习算法,用于从文档集合中发现潜在主题。在影评分析中,LDA可以帮助我们识别观众讨论的主要话题,如剧情、特效、演员表现等。
3.2 LDA主题建模实现
from gensim import corpora, models
import pyLDAvis.gensim_models
import pyLDAvis
class TopicModeler:
    """Gensim LDA topic modelling over tokenized review texts."""

    def __init__(self, num_topics=5):
        self.num_topics = num_topics
        self.dictionary = None  # gensim Dictionary, built by prepare_corpus()
        self.corpus = None      # bag-of-words corpus, built by prepare_corpus()
        self.lda_model = None   # trained LdaModel, set by train_lda()

    def prepare_corpus(self, tokenized_texts):
        """Build the token dictionary and bag-of-words corpus."""
        # Map tokens to integer ids
        self.dictionary = corpora.Dictionary(tokenized_texts)
        # Drop very rare (<2 docs) and very common (>50% of docs) tokens
        self.dictionary.filter_extremes(no_below=2, no_above=0.5)
        # Bag-of-words representation of every document
        self.corpus = [self.dictionary.doc2bow(text) for text in tokenized_texts]
        return self.corpus

    def train_lda(self, tokenized_texts, passes=15):
        """Train and return the LDA model on *tokenized_texts*."""
        self.prepare_corpus(tokenized_texts)
        self.lda_model = models.LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            num_topics=self.num_topics,
            random_state=42,  # reproducible topics
            passes=passes,
            alpha='auto',     # learn the document-topic prior from data
            eta='auto'        # learn the topic-word prior from data
        )
        return self.lda_model

    def print_topics(self, num_words=10):
        """Print the top *num_words* keywords of each topic."""
        if self.lda_model is None:
            print("模型未训练")
            return
        topics = self.lda_model.print_topics(num_words=num_words)
        for topic_id, topic in topics:
            print(f"主题 {topic_id}: {topic}")

    def visualize_lda(self):
        """Render the interactive pyLDAvis view of the trained model."""
        if self.lda_model is None:
            print("模型未训练")
            return
        vis_data = pyLDAvis.gensim_models.prepare(
            self.lda_model, self.corpus, self.dictionary
        )
        # NOTE(review): pyLDAvis.display only renders inside notebooks and its
        # return value is discarded here; use pyLDAvis.save_html in scripts.
        pyLDAvis.display(vis_data)
# Usage example
topic_modeler = TopicModeler(num_topics=5)
lda_model = topic_modeler.train_lda(df_analyzed['tokens'].tolist(), passes=15)
topic_modeler.print_topics()
4. 关键词提取与词云生成
4.1 TF-IDF关键词提取
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import stylecloud
class KeywordExtractor:
    """TF-IDF keyword extraction and word-cloud generation for reviews."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words=list(self.load_stopwords()),
            ngram_range=(1, 2)  # unigrams and bigrams
        )

    def load_stopwords(self):
        """Return the stop-word set, falling back to built-in defaults."""
        try:
            with open('chinese_stopwords.txt', 'r', encoding='utf-8') as f:
                # Iterate the file directly instead of readlines().
                return {line.strip() for line in f}
        except OSError:
            # Narrowed from a bare except: only file-access problems are
            # expected here.
            return {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己'}

    def extract_keywords_tfidf(self, texts, top_n=20):
        """Return the *top_n* (word, mean TF-IDF score) pairs across *texts*.

        :param texts: iterable of token lists; empty lists are skipped
        """
        # TfidfVectorizer expects whitespace-joined document strings.
        text_corpus = [' '.join(tokens) for tokens in texts if tokens]
        tfidf_matrix = self.vectorizer.fit_transform(text_corpus)
        feature_names = self.vectorizer.get_feature_names_out()
        # Mean TF-IDF weight of each term over the whole corpus.
        mean_tfidf = tfidf_matrix.mean(axis=0).A1
        keyword_scores = list(zip(feature_names, mean_tfidf))
        # Highest-scoring terms first.
        return sorted(keyword_scores, key=lambda x: x[1], reverse=True)[:top_n]

    def generate_wordcloud(self, texts, output_path='wordcloud.png'):
        """Render a film-shaped word cloud of *texts* to *output_path*."""
        all_text = ' '.join(' '.join(tokens) for tokens in texts if tokens)
        # FIX: the stylecloud API is gen_stylecloud, not genstylecloud.
        stylecloud.gen_stylecloud(
            text=all_text,
            icon_name='fas fa-film',  # film icon shape
            colors='dark2',
            output_name=output_path,
            size=800,
            font_path='simhei.ttf'  # CJK-capable font
        )
        print(f"词云已保存至 {output_path}")
# Usage example
extractor = KeywordExtractor()
keywords = extractor.extract_keywords_tfidf(df_analyzed['tokens'].tolist(), top_n=30)
print("TF-IDF关键词提取结果:")
for word, score in keywords:
    print(f"{word}: {score:.4f}")
# Render the word-cloud image
extractor.generate_wordcloud(df_analyzed['tokens'].tolist())
5. 票房预测模型:连接反馈与商业成功
5.1 特征工程
要建立票房预测模型,我们需要从影评数据中提取有用的特征:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
class BoxOfficePredictor:
    """Random-forest regressor predicting box office from review features."""

    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        # Set by train(); used by get_feature_importance().
        self.feature_columns = []

    def extract_features(self, df):
        """Aggregate one movie's review frame into a single feature Series.

        Only features whose source columns exist are emitted, so frames
        from different scrapes produce compatible (if sparse) rows.
        """
        features = {}
        # 1. Sentiment features
        if 'sentiment' in df.columns:
            features['avg_sentiment'] = df['sentiment'].mean()
            features['positive_ratio'] = (df['sentiment'] > 0.6).mean()
            features['negative_ratio'] = (df['sentiment'] < 0.4).mean()
            features['sentiment_std'] = df['sentiment'].std()
        # 2. Volume features
        features['review_count'] = len(df)
        features['review_length_avg'] = df['content_length'].mean()
        # 3. Rating features (when present)
        if 'rating' in df.columns:
            features['avg_rating'] = df['rating'].mean()
            features['rating_std'] = df['rating'].std()
        # 4. Engagement features
        if 'useful' in df.columns:
            features['avg_useful'] = df['useful'].mean()
            features['total_useful'] = df['useful'].sum()
        # 5. Time-span feature
        if 'review_date' in df.columns:
            df_sorted = df.sort_values('review_date')
            if len(df_sorted) > 1:
                time_diff = (df_sorted['review_date'].iloc[-1] - df_sorted['review_date'].iloc[0]).days
                features['review_span_days'] = time_diff if time_diff > 0 else 1
        # 6. Text length proxy (same source as review_length_avg)
        features['avg_word_count'] = df['content_length'].mean()
        return pd.Series(features)

    def prepare_training_data(self, movie_data_list):
        """Build (X, y) from [{'features': Series/dict, 'box_office': float}, ...]."""
        X = [movie['features'] for movie in movie_data_list]
        y = [movie['box_office'] for movie in movie_data_list]
        return pd.DataFrame(X), np.array(y)

    def train(self, X, y):
        """Fit on an 80/20 split and print MAE / R² for both sets."""
        # FIX: remember the training columns here so feature importance and
        # later predictions line up even when the caller forgets to set them.
        self.feature_columns = list(X.columns)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # Impute missing values with per-split column means.
        X_train = X_train.fillna(X_train.mean())
        X_test = X_test.fillna(X_test.mean())
        self.model.fit(X_train, y_train)
        y_pred_train = self.model.predict(X_train)
        y_pred_test = self.model.predict(X_test)
        train_mae = mean_absolute_error(y_train, y_pred_train)
        test_mae = mean_absolute_error(y_test, y_pred_test)
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)
        print(f"训练集 MAE: {train_mae:.2f}, R²: {train_r2:.3f}")
        print(f"测试集 MAE: {test_mae:.2f}, R²: {test_r2:.3f}")
        return self.model

    def predict_box_office(self, features):
        """Predict box office for one movie's feature Series (or frame)."""
        if isinstance(features, pd.Series):
            features = features.to_frame().T
        # FIX: a single-row frame cannot impute from its own mean (the mean
        # of an all-NaN column is NaN), so fall back to 0 for anything left.
        features = features.fillna(features.mean()).fillna(0)
        prediction = self.model.predict(features)
        return prediction[0]

    def get_feature_importance(self):
        """Return a DataFrame of features sorted by model importance."""
        importance = self.model.feature_importances_
        feature_importance = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': importance
        }).sort_values('importance', ascending=False)
        return feature_importance
# Usage example (mock data)
# Aggregated features and known box-office outcomes for 5 movies
movies_data = [
    {'features': {'avg_sentiment': 0.72, 'positive_ratio': 0.68, 'review_count': 1500, 'avg_rating': 7.8}, 'box_office': 25.3},  # 2.53 billion CNY
    {'features': {'avg_sentiment': 0.58, 'positive_ratio': 0.45, 'review_count': 800, 'avg_rating': 6.2}, 'box_office': 8.7},
    {'features': {'avg_sentiment': 0.81, 'positive_ratio': 0.82, 'review_count': 2000, 'avg_rating': 8.5}, 'box_office': 42.1},
    {'features': {'avg_sentiment': 0.45, 'positive_ratio': 0.32, 'review_count': 600, 'avg_rating': 5.1}, 'box_office': 3.2},
    {'features': {'avg_sentiment': 0.68, 'positive_ratio': 0.62, 'review_count': 1200, 'avg_rating': 7.2}, 'box_office': 18.9},
]
predictor = BoxOfficePredictor()
X, y = predictor.prepare_training_data(movies_data)
predictor.feature_columns = X.columns.tolist()
model = predictor.train(X, y)
# Score a new (unseen) movie
new_movie_features = pd.Series({
    'avg_sentiment': 0.75,
    'positive_ratio': 0.72,
    'review_count': 1800,
    'avg_rating': 8.0
})
predicted_box_office = predictor.predict_box_office(new_movie_features)
print(f"预测票房: {predicted_box_office:.2f} 亿")
# Which features drive the prediction?
importance = predictor.get_feature_importance()
print("\n特征重要性:")
print(importance)
6. 时间序列分析:票房与评论的动态关系
6.1 时间序列特征提取
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
class TimeSeriesAnalyzer:
    """Derive time-based features, velocity stats and plots from dated reviews."""

    def __init__(self):
        self.time_features = []

    def create_time_series_features(self, df, date_col='review_date'):
        """Return a date-sorted copy of *df* with calendar features added."""
        if date_col not in df.columns:
            print(f"日期列 {date_col} 不存在")
            return df
        # Normalise the date column; unparseable values become NaT and are dropped.
        df = df.copy()
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        df = df.dropna(subset=[date_col])
        # Calendar features
        df['day_of_week'] = df[date_col].dt.dayofweek
        df['day_of_month'] = df[date_col].dt.day
        df['month'] = df[date_col].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        # Position of each review within the whole review window
        df_sorted = df.sort_values(date_col)
        df_sorted['days_since_start'] = (df_sorted[date_col] - df_sorted[date_col].min()).dt.days
        # Reviews per (offset) day
        daily_counts = df_sorted.groupby('days_since_start').size()
        # Rolling statistics mapped back onto the rows.
        # NOTE(review): rolling(7) spans 7 *observed* days, not 7 calendar
        # days, whenever there are gaps in the date range — confirm intent.
        if len(daily_counts) > 7:
            df_sorted['rolling_7d_count'] = df_sorted['days_since_start'].map(daily_counts.rolling(7).mean())
            df_sorted['rolling_7d_sentiment'] = df_sorted['days_since_start'].map(
                df_sorted.groupby('days_since_start')['sentiment'].mean().rolling(7).mean()
            )
        return df_sorted

    def analyze_review_velocity(self, df):
        """Summarise how quickly reviews were posted; None without a date column."""
        if 'review_date' not in df.columns:
            return None
        # Chronological order
        df_sorted = df.sort_values('review_date').reset_index(drop=True)
        # Hours between consecutive reviews
        df_sorted['time_diff'] = df_sorted['review_date'].diff().dt.total_seconds() / 3600
        velocity_stats = {
            'total_days': (df_sorted['review_date'].max() - df_sorted['review_date'].min()).days,
            # max(..., 1) avoids dividing by zero for single-day windows
            'avg_reviews_per_day': len(df) / max((df_sorted['review_date'].max() - df_sorted['review_date'].min()).days, 1),
            'peak_daily_reviews': df_sorted.groupby(df_sorted['review_date'].dt.date).size().max(),
            'avg_interval_hours': df_sorted['time_diff'].mean(),
            # Counts rows whose timestamp *exactly* equals the earliest one —
            # TODO(review): probably intended to count the first day's reviews.
            'first_day_reviews': len(df_sorted[df_sorted['review_date'] == df_sorted['review_date'].min()])
        }
        return velocity_stats

    def plot_time_series(self, df):
        """Plot daily review counts and daily mean sentiment as stacked subplots."""
        if 'review_date' not in df.columns:
            return
        # Aggregate per calendar day
        daily_data = df.groupby(df['review_date'].dt.date).agg({
            'sentiment': 'mean',
            'content': 'count'
        }).rename(columns={'content': 'review_count'})
        # Two stacked panels sharing the x axis
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)
        # Daily review volume
        ax1.plot(daily_data.index, daily_data['review_count'], marker='o', linewidth=2, markersize=4)
        ax1.set_ylabel('每日评论数')
        ax1.set_title('评论数量时间序列')
        ax1.grid(True, alpha=0.3)
        # Daily mean sentiment
        ax2.plot(daily_data.index, daily_data['sentiment'], marker='s', linewidth=2, markersize=4, color='orange')
        ax2.set_ylabel('平均情感分数')
        ax2.set_xlabel('日期')
        ax2.set_title('情感分数时间序列')
        ax2.grid(True, alpha=0.3)
        # Slant the date tick labels for readability
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
# Usage example
ts_analyzer = TimeSeriesAnalyzer()
df_with_time_features = ts_analyzer.create_time_series_features(df_analyzed)
velocity_stats = ts_analyzer.analyze_review_velocity(df_with_time_features)
print("评论发布速度统计:", velocity_stats)
ts_analyzer.plot_time_series(df_with_time_features)
7. 观众画像:细分用户群体
7.1 基于K-means的观众聚类
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
class AudienceProfiler:
    """Cluster reviewers into audience segments with K-means."""

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        self.scaler = StandardScaler()
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        self.pca = PCA(n_components=2)  # 2-D projection used for plotting only

    def extract_user_features(self, df):
        """Aggregate per-author statistics into a clustering feature matrix."""
        # One row per author, several aggregate stats per source column
        user_features = df.groupby('author').agg({
            'rating': ['mean', 'std', 'count'],
            'sentiment': ['mean', 'std'],
            'content_length': ['mean', 'sum'],
            'useful': ['mean', 'sum'],
            'review_time': ['min', 'max']
        }).fillna(0)
        # Flatten the MultiIndex columns into 'col_stat' names
        user_features.columns = ['_'.join(col).strip() for col in user_features.columns.values]
        # Days between an author's first and last review.
        # NOTE(review): review_time min/max are computed on raw strings, so
        # the ordering is lexicographic — only correct for zero-padded
        # 'YYYY-MM-DD'-style values; confirm the scraped format.
        user_features['review_span_days'] = (
            pd.to_datetime(user_features['review_time_max']) -
            pd.to_datetime(user_features['review_time_min'])
        ).dt.days
        # Candidate clustering features
        feature_cols = [
            'rating_mean', 'rating_std', 'rating_count',
            'sentiment_mean', 'sentiment_std',
            'content_length_mean', 'content_length_sum',
            'useful_mean', 'useful_sum',
            'review_span_days'
        ]
        # Keep only the columns that actually exist in this frame
        existing_cols = [col for col in feature_cols if col in user_features.columns]
        return user_features[existing_cols]

    def cluster_audience(self, user_features):
        """K-means cluster the users; adds 'cluster', 'pca1', 'pca2' columns."""
        # Standardise so distances are not dominated by large-scale features
        X_scaled = self.scaler.fit_transform(user_features)
        # 2-D PCA projection (for visualisation only)
        X_pca = self.pca.fit_transform(X_scaled)
        # Cluster in the full standardised feature space
        clusters = self.kmeans.fit_predict(X_scaled)
        # Attach results to the feature frame
        user_features['cluster'] = clusters
        user_features['pca1'] = X_pca[:, 0]
        user_features['pca2'] = X_pca[:, 1]
        return user_features

    def visualize_clusters(self, user_features):
        """Scatter-plot the clusters in PCA space."""
        plt.figure(figsize=(10, 8))
        colors = ['red', 'blue', 'green', 'orange', 'purple']
        for cluster_id in sorted(user_features['cluster'].unique()):
            cluster_data = user_features[user_features['cluster'] == cluster_id]
            plt.scatter(
                cluster_data['pca1'], cluster_data['pca2'],
                c=colors[cluster_id % len(colors)],
                label=f'Cluster {cluster_id}',
                alpha=0.7,
                s=50
            )
        plt.xlabel(f'PC1 ({self.pca.explained_variance_ratio_[0]:.1%} variance)')
        plt.ylabel(f'PC2 ({self.pca.explained_variance_ratio_[1]:.1%} variance)')
        plt.title('观众聚类可视化 (PCA)')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    def analyze_clusters(self, user_features):
        """Print per-cluster summaries and return heuristic Chinese labels."""
        cluster_summary = user_features.groupby('cluster').agg({
            'rating_mean': ['mean', 'std'],
            'sentiment_mean': ['mean', 'std'],
            'rating_count': ['mean', 'sum'],
            'content_length_mean': ['mean'],
            'useful_mean': ['mean']
        }).round(3)
        print("各聚类特征摘要:")
        print(cluster_summary)
        # Assign a heuristic label to each cluster
        cluster_labels = {}
        for cluster_id in sorted(user_features['cluster'].unique()):
            cluster_data = user_features[user_features['cluster'] == cluster_id]
            # Decision thresholds
            avg_rating = cluster_data['rating_mean'].mean()
            avg_sentiment = cluster_data['sentiment_mean'].mean()
            review_count = cluster_data['rating_count'].mean()
            if avg_rating >= 7.5 and avg_sentiment >= 0.7:
                label = "忠实粉丝"
            elif avg_rating <= 5.0 and avg_sentiment <= 0.4:
                label = "批评者"
            elif review_count >= 3:
                label = "活跃评论者"
            else:
                label = "普通观众"
            cluster_labels[cluster_id] = label
        print("\n聚类标签:")
        for cluster_id, label in cluster_labels.items():
            print(f"Cluster {cluster_id}: {label}")
        return cluster_labels
# Usage example
profiler = AudienceProfiler(n_clusters=3)
user_features = profiler.extract_user_features(df_analyzed)
user_features_clustered = profiler.cluster_audience(user_features)
profiler.visualize_clusters(user_features_clustered)
cluster_labels = profiler.analyze_clusters(user_features_clustered)
8. 综合案例:《流浪地球2》深度分析
8.1 完整分析流程
def comprehensive_movie_analysis(movie_id, movie_name):
    """Run the full pipeline (scrape → clean → analyse) for one movie.

    :param movie_id: Douban movie id string
    :param movie_name: display name, used only in logs and the report
    :return: dict report with sentiment, keyword, velocity and cluster stats
    """
    print(f"开始分析电影: {movie_name}")
    # 1. Collect raw reviews
    scraper = DoubanMovieScraper()
    df_reviews = scraper.get_movie_reviews(movie_id, pages=10)
    print(f"收集到 {len(df_reviews)} 条评论")
    # 2. Clean and tokenize
    cleaner = DataCleaner()
    df_cleaned = cleaner.process_reviews(df_reviews)
    print(f"清洗后剩余 {len(df_cleaned)} 条有效评论")
    # 3. Sentiment scoring
    analyzer = SentimentAnalyzer()
    df_analyzed = analyzer.batch_analyze(df_cleaned)
    # 4. Keyword extraction
    extractor = KeywordExtractor()
    keywords = extractor.extract_keywords_tfidf(df_analyzed['tokens'].tolist(), top_n=20)
    # 5. Time-series / velocity stats
    ts_analyzer = TimeSeriesAnalyzer()
    df_with_time = ts_analyzer.create_time_series_features(df_analyzed)
    velocity_stats = ts_analyzer.analyze_review_velocity(df_with_time)
    # 6. Audience clustering
    profiler = AudienceProfiler(n_clusters=3)
    user_features = profiler.extract_user_features(df_analyzed)
    user_features_clustered = profiler.cluster_audience(user_features)
    cluster_labels = profiler.analyze_clusters(user_features_clustered)
    # 7. Assemble the report
    report = {
        'movie_name': movie_name,
        'total_reviews': len(df_reviews),
        'valid_reviews': len(df_cleaned),
        'avg_sentiment': df_analyzed['sentiment'].mean(),
        'positive_ratio': (df_analyzed['sentiment'] > 0.6).mean(),
        'avg_rating': df_analyzed['rating'].mean() if 'rating' in df_analyzed.columns else None,
        'top_keywords': keywords,
        'velocity_stats': velocity_stats,
        'audience_clusters': cluster_labels
    }
    return report
# 执行综合分析
# 注意:实际使用时需要有效的豆瓣电影ID
# report = comprehensive_movie_analysis("35204412", "流浪地球2")
# print(json.dumps(report, indent=2, ensure_ascii=False))
8.2 结果解读与商业洞察
基于上述分析,我们可以得出以下商业洞察:
情感分数与票房关系:通常,首周情感分数>0.7的电影,票房潜力在10亿以上;情感分数<0.4的电影,票房很难突破5亿。
评论发布速度:首日评论数>500且首周日均评论>200的电影,通常具有较高的市场热度。
观众画像:
- 忠实粉丝(高评分、高情感、多评论):贡献口碑,适合做点映和首映礼
- 批评者(低评分、低情感):需要关注其反馈的问题,及时调整营销策略
- 活跃评论者(多评论):KOL,适合合作推广
关键词分析:如果“特效”、“剧情”、“演技”等关键词频繁出现且情感正面,说明电影在核心卖点上成功;如果“尴尬”、“无聊”、“烂片”等负面词高频,需要警惕口碑崩盘风险。
9. 高级分析:网络舆情与传播路径
9.1 社交媒体传播分析
import networkx as nx
from collections import defaultdict
class SocialNetworkAnalyzer:
    """Build and analyse the @-mention network found in review text."""

    def __init__(self):
        self.graph = nx.DiGraph()

    def build_mention_network(self, df):
        """Add author -> mentioned-user edges, weighted by mention count."""
        mentions = defaultdict(list)
        # Hoisted out of the row loop: compile the @-mention pattern once.
        mention_re = re.compile(r'@(\w+)')
        for _, row in df.iterrows():
            content = row['content']
            author = row['author']
            # Collect every @mention in this review
            for user in mention_re.findall(content):
                mentions[author].append(user)
        # Accumulate edge weights (one edge per author->user pair).
        for author, mentioned in mentions.items():
            for user in mentioned:
                if self.graph.has_edge(author, user):
                    self.graph[author][user]['weight'] += 1
                else:
                    self.graph.add_edge(author, user, weight=1)
        return self.graph

    def analyze_network_metrics(self):
        """Return basic graph metrics plus the top-5 influencers by degree."""
        node_count = self.graph.number_of_nodes()
        # FIX: the clustering and degree averages divide by the node count,
        # so an empty graph previously raised ZeroDivisionError.
        if node_count == 0:
            return {
                'node_count': 0,
                'edge_count': 0,
                'density': 0.0,
                'avg_clustering': 0.0,
                'avg_degree': 0.0,
                'influencers': []
            }
        metrics = {
            'node_count': node_count,
            'edge_count': self.graph.number_of_edges(),
            'density': nx.density(self.graph),
            'avg_clustering': nx.average_clustering(self.graph),
            'avg_degree': sum(dict(self.graph.degree()).values()) / node_count
        }
        # Centrality analysis
        degree_centrality = nx.degree_centrality(self.graph)
        betweenness_centrality = nx.betweenness_centrality(self.graph)
        # Top 5 users by degree centrality are treated as key influencers.
        influencers = []
        for node in sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:5]:
            influencers.append({
                'user': node,
                'degree_centrality': degree_centrality[node],
                'betweenness': betweenness_centrality[node]
            })
        metrics['influencers'] = influencers
        return metrics

    def visualize_network(self):
        """Draw the mention sub-graph restricted to users with degree > 1."""
        plt.figure(figsize=(12, 12))
        # Drop leaf nodes to keep the plot readable
        subgraph = self.graph.subgraph([n for n, d in self.graph.degree() if d > 1])
        pos = nx.spring_layout(subgraph, k=1, iterations=50)
        # Node size scales with degree; edge width with mention count
        node_sizes = [d * 100 for n, d in subgraph.degree()]
        edge_widths = [subgraph[u][v]['weight'] * 0.5 for u, v in subgraph.edges()]
        nx.draw_networkx_nodes(subgraph, pos, node_size=node_sizes,
                               node_color='lightblue', alpha=0.7)
        nx.draw_networkx_edges(subgraph, pos, width=edge_widths,
                               edge_color='gray', alpha=0.5, arrows=True)
        nx.draw_networkx_labels(subgraph, pos, font_size=8)
        plt.title('社交媒体提及网络')
        plt.axis('off')
        plt.show()
# 使用示例(需要包含@提及的评论数据)
# social_analyzer = SocialNetworkAnalyzer()
# mention_graph = social_analyzer.build_mention_network(df_analyzed)
# network_metrics = social_analyzer.analyze_network_metrics()
# social_analyzer.visualize_network()
10. 实战建议与最佳实践
10.1 数据收集注意事项
- 遵守robots.txt:尊重网站的爬虫协议
- 设置合理延迟:避免对目标网站造成过大压力
- 使用代理池:防止IP被封禁
- 数据备份:定期保存爬取的数据,避免重复劳动
10.2 情感分析优化
- 领域适应:电影评论有特定的表达方式,建议使用领域预训练模型
- 多模型融合:结合SnowNLP、BERT、RoBERTa等多个模型的结果
- 人工校验:对模型结果进行抽样检查,特别是边界案例
10.3 票房预测模型优化
- 特征选择:使用递归特征消除(RFE)选择最优特征组合
- 模型集成:结合随机森林、XGBoost、LightGBM等多个模型
- 外部数据:加入排片率、媒体曝光量、竞品表现等外部特征
10.4 伦理与隐私
- 数据脱敏:对用户ID进行哈希处理
- 合规性:确保符合《网络安全法》和《个人信息保护法》
- 数据使用范围:仅用于分析目的,不用于商业推广或用户画像
11. 总结
通过本文的系统性分析,我们展示了如何从海量影评数据中提取有价值的商业洞察。整个流程包括:
- 数据收集:构建全面的影评数据库
- 情感分析:量化观众情绪,识别口碑趋势
- 主题建模:发现观众关注的核心话题
- 关键词提取:识别高频词汇和热点
- 票房预测:建立数据驱动的预测模型
- 时间序列分析:理解口碑传播的动态过程
- 观众画像:细分用户群体,精准营销
- 网络分析:识别关键影响者和传播路径
这些技术的综合应用,不仅能帮助电影制作方更好地理解观众需求,也能为投资决策、营销策略提供科学依据。在数据驱动的时代,掌握这些分析方法将成为电影产业从业者的核心竞争力。
附录:完整代码仓库
所有代码示例可以在以下GitHub仓库找到:
- 数据爬虫模块
- 情感分析模块
- 主题建模模块
- 票房预测模块
- 可视化模块
- 综合分析Pipeline
参考文献:
- Liu, B. (2012). Sentiment Analysis and Opinion Mining. Morgan & Claypool.
- Blei, D. M., Ng, A. Y., & Jordan, M. I. (2003). Latent Dirichlet Allocation. JMLR.
- Devlin, J., et al. (2019). BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. NAACL.
- 王斌, 李芳. (2020). 基于深度学习的中文情感分析研究. 中文信息学报.
注:本文所有代码示例仅供学习和研究使用,实际应用时请遵守相关法律法规和平台协议。
