推荐算法复习
用的是movielens的数据
数据地址:https://files.grouplens.org/datasets/movielens/ml-1m.zip
基于内容推荐
Content-based recommendation
核心思想:根据用户过去喜欢的内容,推荐与其相似的新内容
文本类特征(本例中为电影的 genres 字段)通过 TF-IDF 提取特征向量,再用余弦相似度衡量电影之间的相似程度
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# 基于内容推荐
# 核心
# 1.TF-IDF提取电影的特征向量
# 2.计算电影之间的相似度
# 3.推荐电影
# Load the MovieLens-1M tables. Fields are separated by '::'; a separator
# longer than one character requires the python parsing engine.
ratings = pd.read_table('data/ratings.dat', sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
movies = pd.read_table('data/movies.dat', sep='::', header=None, names=['movie_id', 'title', 'genres'], engine='python', encoding='ISO-8859-1')

# Genres are '|'-separated (e.g. "Action|Sci-Fi"). Tokenise on '|' so that
# every genre is exactly one feature; the default token pattern would break
# hyphenated genres such as "Sci-Fi" into the unrelated tokens "sci" and "fi".
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# cosine_sim[i][j] is the similarity between the movies stored in rows i and j
# of `movies` (row positions, not movie_id values).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# 推荐函数
def recommend_movies(user, ratings, movies, cosine_sim, top_n=3):
    """Recommend unseen movies that are most similar to what `user` has rated.

    Every movie the user rated contributes equally: the score of a candidate
    is its mean similarity to all rated movies.

    Args:
        user: Value to look up in ``ratings['user_id']``.
        ratings: DataFrame with at least ``user_id`` and ``movie_id`` columns.
        movies: DataFrame with ``movie_id`` and ``title`` columns.
        cosine_sim: Square NumPy similarity matrix whose row/column ``i``
            corresponds to row ``i`` of ``movies``.
        top_n: Maximum number of titles to return.

    Returns:
        A list of at most ``top_n`` titles, best match first. Empty when the
        user has no rating for any movie present in ``movies``.
    """
    user_ratings = ratings[ratings['user_id'] == user]

    # cosine_sim is indexed by row position in `movies`, not by movie_id
    # (ids are 1-based and have gaps), so translate the rated ids into row
    # positions before touching the matrix.
    rated_ids = set(user_ratings['movie_id'])
    is_rated = movies['movie_id'].isin(rated_ids).to_numpy()
    user_indices = is_rated.nonzero()[0]
    if len(user_indices) == 0:
        return []

    # Mean similarity of every movie to the set of rated movies.
    user_similarities = cosine_sim[user_indices].mean(axis=0)

    # (row position, score) pairs, highest score first.
    movie_scores = sorted(enumerate(user_similarities), key=lambda x: x[1], reverse=True)
    # Debug output: raw top scores, which may still include rated movies.
    print(movie_scores[:top_n])

    # Drop movies the user already rated; a set keeps the lookup O(1).
    seen_rows = set(user_indices.tolist())
    recommended_movies = [movies['title'].iloc[i] for i, _ in movie_scores if i not in seen_rows]
    return recommended_movies[:top_n]
# Demo run: show the content-based recommendations for user 1 (default top_n).
print(recommend_movies(user=1, ratings=ratings, movies=movies, cosine_sim=cosine_sim))
# 添加用户评分权重
def recommend_movies_with_weights(user, ratings, movies, cosine_sim, top_n=3):
    """Recommend unseen movies, weighting each rated movie by the user's rating.

    The score of a candidate is the sum, over the movies the user rated, of
    (similarity to that movie) * (weight derived from the rating), so highly
    rated movies pull the recommendations towards themselves.

    Args:
        user: Value to look up in ``ratings['user_id']``.
        ratings: DataFrame with ``user_id``, ``movie_id`` and ``rating``
            columns.
        movies: DataFrame with ``movie_id`` and ``title`` columns.
        cosine_sim: Square NumPy similarity matrix whose row/column ``i``
            corresponds to row ``i`` of ``movies``.
        top_n: Maximum number of titles to return.

    Returns:
        A list of at most ``top_n`` titles, best match first. Empty when the
        user has no rating for any movie present in ``movies``.
    """
    user_ratings = ratings[ratings['user_id'] == user]

    # Attach to every rating the row position of its movie inside `movies`.
    # cosine_sim is indexed by that position; the index of the merged frame
    # (0..n-1) and the movie_id values are both unrelated to it.
    movie_rows = movies[['movie_id']].copy()
    movie_rows['_row_pos'] = range(len(movie_rows))
    user_data = user_ratings.merge(movie_rows, on='movie_id')
    if user_data.empty:
        return []

    user_indices = user_data['_row_pos'].to_numpy()
    user_ratings_list = user_data['rating'].to_numpy()

    # Map ratings 1..5 onto weights 0.1..1.1; the +0.1 offset keeps a
    # 1-star rating from being ignored completely.
    normalized_ratings = (user_ratings_list - 1) / 4 + 0.1

    # Scale each rated movie's similarity row by its weight, then add the
    # rows up to get one combined score per movie.
    weighted_similarities = cosine_sim[user_indices] * normalized_ratings[:, None]
    user_similarities = weighted_similarities.sum(axis=0)

    # (row position, score) pairs, highest score first.
    movie_scores = sorted(enumerate(user_similarities), key=lambda x: x[1], reverse=True)

    # Keep only movies the user has not rated yet.
    seen_rows = set(user_indices.tolist())
    candidate_rows = [i for i, _ in movie_scores if i not in seen_rows]
    # Debug output: row positions (in `movies`) of the selected candidates.
    print(candidate_rows[:top_n])

    return [movies['title'].iloc[i] for i in candidate_rows[:top_n]]
# Demo run: show the rating-weighted recommendations for user 1 (default top_n).
print(recommend_movies_with_weights(user=1, ratings=ratings, movies=movies, cosine_sim=cosine_sim))
Item-协同过滤
Item-Based Collaborative Filtering
核心思想:“物以类聚”——喜欢A的人也喜欢B,那么A和B相似,就给喜欢A的人推荐B。
实现方式:先统计物品的共现矩阵(两部电影被同一用户评分的次数),再用两部电影各自的评分次数做归一化,得到物品之间的相似度
# @Author : ljl
# @Time : 2025/5/21 下午2:38
import pandas as pd
import numpy as np
# Item-based collaborative filtering on MovieLens-1M, written as a flat script.
TARGET_USER = 1             # user to produce recommendations for
NEIGHBOURS_PER_MOVIE = 10   # most-similar movies considered per rated movie
TOP_N = 3                   # number of recommendations to display

ratings = pd.read_table('data/ratings.dat', sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')

# dataset[user_id][movie_id] = rating
dataset = {}
for row in ratings.itertuples(index=False):
    dataset.setdefault(row.user_id, {})[row.movie_id] = row.rating

# Sparse co-occurrence counts: co_matrix[i][j] is the number of users who
# rated both movie i and movie j (i != j).
co_matrix = dict()
# Number of users who rated each movie.
movie_rating_counts = dict()
# The loop variable is deliberately not called `ratings`, so the ratings
# DataFrame loaded above is not overwritten.
for user_ratings in dataset.values():
    for i in user_ratings:
        movie_rating_counts[i] = movie_rating_counts.get(i, 0) + 1
        for j in user_ratings:
            if i == j:
                continue
            co_row = co_matrix.setdefault(i, dict())
            co_row[j] = co_row.get(j, 0) + 1

# Similarity: co-occurrence count normalised by the geometric mean of the two
# movies' rating counts, so very popular movies do not dominate.
similar_matrix = dict()
for i, co_row in co_matrix.items():
    similar_matrix[i] = {
        j: cij / np.sqrt(movie_rating_counts[i] * movie_rating_counts[j])
        for j, cij in co_row.items()
    }

# Score candidates for the target user: for each movie the user rated, take
# its most similar movies and add (user's rating * similarity) to each
# unseen candidate.
rank = dict()
user_rate_movie = dataset[TARGET_USER]
for i, rating in user_rate_movie.items():
    # .get(): a movie nobody co-rated with another has no similarity row.
    neighbours = sorted(similar_matrix.get(i, {}).items(), key=lambda x: x[1], reverse=True)[:NEIGHBOURS_PER_MOVIE]
    for j, similarity in neighbours:
        if j in user_rate_movie:
            continue
        rank[j] = rank.get(j, 0) + rating * similarity

# (movie_id, score) pairs, best candidate first.
result = sorted(rank.items(), key=lambda x: x[1], reverse=True)

# Look up and display the recommended movies. Slicing (instead of indexing
# result[0..2]) keeps this working when fewer than TOP_N candidates exist.
movies = pd.read_table('data/movies.dat', sep='::', header=None, names=['movie_id', 'title', 'genres'], engine='python', encoding='ISO-8859-1')
top_movie_ids = [movie_id for movie_id, _ in result[:TOP_N]]
print(movies[movies['movie_id'].isin(top_movie_ids)])