

Recommendation System

Cosine Similarity
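
Cosine similarity between two vectors A and B is their dot product divided by the product of their norms, A·B / (||A|| ||B||); for non-negative TF-IDF vectors it ranges from 0 (no shared terms) to 1 (same direction). A minimal sketch with made-up vectors, using only numpy:

import numpy as np

def cos_sim(a, b):
    # dot product divided by the product of the two vector norms
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

a = np.array([1.0, 2.0, 0.0])
b = np.array([2.0, 4.0, 0.0])  # same direction as a
c = np.array([0.0, 0.0, 3.0])  # orthogonal to a
print(cos_sim(a, b))  # 1.0
print(cos_sim(a, c))  # 0.0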

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
In [2]:
data = pd.read_csv('datasets/movies_metadata.csv', low_memory=False)
data.head(2)
Out [2]:
adult belongs_to_collection budget genres homepage id imdb_id original_language original_title overview ... release_date revenue runtime spoken_languages status tagline title video vote_average vote_count
0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... http://toystory.disney.com/toy-story 862 tt0114709 en Toy Story Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] Released NaN Toy Story False 7.7 5415.0
1 False NaN 65000000 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... NaN 8844 tt0113497 en Jumanji When siblings Judy and Peter discover an encha... ... 1995-12-15 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released Roll the dice and unleash the excitement! Jumanji False 6.9 2413.0

2 rows × 24 columns

In [3]:
data.info()
Out [3]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  revenue                45460 non-null  float64
 16  runtime                45203 non-null  float64
 17  spoken_languages       45460 non-null  object 
 18  status                 45379 non-null  object 
 19  tagline                20412 non-null  object 
 20  title                  45460 non-null  object 
 21  video                  45460 non-null  object 
 22  vote_average           45460 non-null  float64
 23  vote_count             45460 non-null  float64
dtypes: float64(4), object(20)
memory usage: 8.3+ MB

In [4]:
data[['title', 'overview']]
Out [4]:
title overview
0 Toy Story Led by Woody, Andy's toys live happily in his ...
1 Jumanji When siblings Judy and Peter discover an encha...
2 Grumpier Old Men A family wedding reignites the ancient feud be...
3 Waiting to Exhale Cheated on, mistreated and stepped on, the wom...
4 Father of the Bride Part II Just when George Banks has recovered from his ...
... ... ...
45461 Subdue Rising and falling between a man and woman.
45462 Century of Birthing An artist struggles to finish his work while a...
45463 Betrayal When one of her hits goes wrong, a professiona...
45464 Satan Triumphant In a small town live two brothers, one a minis...
45465 Queerama 50 years after decriminalisation of homosexual...

45466 rows × 2 columns

In [5]:
data['overview'].isnull().sum()
Out [5]:
954
In [6]:
data = data[['title', 'overview']]
In [7]:
data.info()
Out [7]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     45460 non-null  object
 1   overview  44512 non-null  object
dtypes: object(2)
memory usage: 710.5+ KB

In [8]:
data.dropna(inplace=True)
data.info()
Out [8]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44506 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44506 non-null  object
 1   overview  44506 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB

In [9]:
tfidf_vec = TfidfVectorizer(stop_words='english')
tfidf_dtm = tfidf_vec.fit_transform(data['overview'])
tfidf_dtm # TF-IDF sparse matrix: 44,506 overviews by 75,827 vocabulary terms
Out [9]:
<44506x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 1210839 stored elements in Compressed Sparse Row format>
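
For intuition, a minimal sketch of what TfidfVectorizer produces on a made-up toy corpus (get_feature_names_out assumes scikit-learn 1.0+):

toy_corpus = ['the cat sat on the mat', 'the dog sat on the log']
toy_vec = TfidfVectorizer(stop_words='english')
toy_dtm = toy_vec.fit_transform(toy_corpus)
print(toy_vec.get_feature_names_out())  # learned vocabulary, e.g. ['cat' 'dog' 'log' 'mat' 'sat']
print(toy_dtm.toarray())                # one row of TF-IDF weights per document; words shared by both documents get lower weight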
In [10]:
cos_sim_res = cosine_similarity(tfidf_dtm, tfidf_dtm)
In [11]:
# Map each title to its index
title_to_index = dict(zip(data['title'], data.index))
# 'Toy Story': 0, 'Jumanji': 1, 'Grumpier Old Men': 2...
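
One caveat, left as-is so the outputs below match the code above: dropna() keeps the original row labels, while the rows of cos_sim_res are positional, so titles that come after a dropped row can map to the wrong (or an out-of-range) similarity row. A minimal sketch of the realignment, not applied here:

# not run in this notebook: realign labels with the similarity-matrix rows
# data = data.reset_index(drop=True)
# title_to_index = dict(zip(data['title'], data.index))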
In [12]:
def get_recommendation(title, n):
    idx = title_to_index[title]
    sim_scores = list(enumerate(cos_sim_res[idx])) # similarity of this movie to every movie, paired with its row index
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sort by similarity (key), descending
    sim_scores_n = sim_scores[1:n+1] # skip the first entry (the movie itself, similarity 1.0) and keep the next n
    movie_idx = [movie[0] for movie in sim_scores_n]
    return data['title'].iloc[movie_idx]
In [13]:
get_recommendation('Toy Story', 5)
Out [13]:
15348                    Toy Story 3
2997                     Toy Story 2
10301         The 40 Year Old Virgin
24523                      Small Fry
23843    Andy Hardy's Blonde Trouble
Name: title, dtype: object
In [14]:
get_recommendation('Batman', 5)
Out [14]:
8681                 Scars of Dracula
18562    Johnny Cash at Folsom Prison
6600        The Prince and the Pauper
5355            Nosferatu the Vampyre
37915                          Vaesen
Name: title, dtype: object
In [15]:
get_recommendation('The Dark Knight Rises', 5)
Out [15]:
31143             Deadly Daycare
19286            The One Percent
44918                  Once More
45106    Nicostratos the Pelican
33008       White Cannibal Queen
Name: title, dtype: object

word2vec and Embeddings

Embeddings such as word2vec pair well with deep learning.
sklearn's TfidfVectorizer pairs well with (classical) machine learning.
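
Unlike TF-IDF's sparse document-term weights, word2vec learns a dense, low-dimensional vector for each word from its surrounding context. A minimal sketch on a made-up toy corpus (gensim 4.x API, same constructor arguments as the real call further below):

from gensim.models import Word2Vec
toy_sents = [['영화', '재밌다'], ['영화', '지루하다'], ['배우', '연기', '좋다']]
toy_model = Word2Vec(sentences=toy_sents, vector_size=8, window=2, min_count=1, sg=0)  # sg=0: CBOW, sg=1: skip-gram
print(toy_model.wv['영화'].shape)  # (8,): one dense vector per vocabulary word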

In [16]:
import pandas as pd
In [17]:
train_data = pd.read_table('datasets/ratings.txt')
train_data.head()
Out [17]:
id document label
0 8112052 어릴때보고 지금다시봐도 재밌어요ㅋㅋ 1
1 8132799 디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산... 1
2 4655635 폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고. 1
3 9251303 와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런... 1
4 10067386 안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화. 1
In [18]:
train_data.info()
Out [18]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        200000 non-null  int64 
 1   document  199992 non-null  object
 2   label     200000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB

In [19]:
train_data.dropna(inplace=True)
train_data.info()
Out [19]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 199992 entries, 0 to 199999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        199992 non-null  int64 
 1   document  199992 non-null  object
 2   label     199992 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 6.1+ MB

In [20]:
# Remove everything that is not Korean
train_data['document'] = train_data['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', regex=True) # keep only ㄱ-ㅎ, ㅏ-ㅣ, 가-힣, and spaces

In [21]:
# A range like ㄱ-ㅣ works (those code points are contiguous), but ㄱ-힣 should not be used, since many non-Korean characters sit between the jamo block and the syllable block.
print(ord('ㄱ'), ord('ㅎ'))
print(ord('ㅏ'), ord('ㅣ'))
print(ord('가'), ord('힣'))
Out [21]:
12593 12622
12623 12643
44032 55203
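
As a quick check of what the filter keeps, a minimal sketch on a made-up string:

sample = pd.Series(['재밌어요ㅋㅋ 10/10!! good'])
print(sample.str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', regex=True)[0])  # only the Korean characters and spaces survive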

In [22]:
from konlpy.tag import Okt
okt = Okt()
In [23]:
stopwords = ["의", "가", "이", "은", "는",
             "들", "좀", "잘", "강", "과",
             "도", "를", "으로", "자", "에",
             "와", "한", "하다"]
In [24]:
from tqdm import tqdm
toked_data = []
for sen in tqdm(train_data['document']):
    toked_sen = okt.morphs(sen, stem=True)  # tokenize into morphemes with stemming
    toked_sen_wo_stop = [word for word in toked_sen if word not in stopwords]  # drop stopwords
    toked_data.append(toked_sen_wo_stop)
Out [24]:
100%|█████████████████████████████████████████████████████████████████████████| 199992/199992 [09:49<00:00, 339.43it/s]

In [25]:
# Maximum review length (in tokens)
max(len(review) for review in toked_data)
Out [25]:
72
In [26]:
# Minimum review length (in tokens)
min(len(review) for review in toked_data)
Out [26]:
0
In [27]:
# Average review length (in tokens)
sum(len(review) for review in toked_data)/len(toked_data)
Out [27]:
10.719648785951438
In [28]:
import matplotlib.pyplot as plt
# Plot a histogram of the review lengths
plt.hist([len(review) for review in toked_data], bins=20)
plt.show()
Out [28]:

[Histogram of tokenized review lengths, 20 bins]

In [29]:
from gensim.models import Word2Vec
model = Word2Vec(sentences=toked_data, vector_size=100, window=5, min_count=5, workers=4, sg=0)
model.wv.vectors.shape # embedding matrix (vocabulary size, vector dimension): each word is represented by a 100-dimensional vector
Out [29]:
(16477, 100)
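
Individual word vectors can be looked up by key; a quick check (assuming '영화', "movie", cleared the min_count=5 cutoff, which is effectively certain in this corpus):

print(model.wv['영화'].shape)  # (100,)
print(model.wv['영화'][:5])    # first five components of the learned embedding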
In [30]:
model.wv.vectors
Out [30]:
array([[-2.8263861e-03,  9.8568387e-02, -9.4999635e-01, ...,
        -8.6086142e-01,  1.3070868e-01,  8.3628201e-01],
       [-1.3327576e+00,  5.4933226e-01, -4.0492460e-01, ...,
        -9.6057999e-01, -1.1362101e+00,  6.1550405e-02],
       [-1.5236272e+00,  2.4189599e-01, -1.3285520e+00, ...,
        -8.9836246e-01,  2.8833647e+00,  1.5803616e-01],
       ...,
       [-3.3974819e-02, -1.4981081e-02,  5.9175860e-02, ...,
        -8.6746484e-02,  4.4356149e-02, -6.0275033e-02],
       [-2.4699664e-02,  3.8863931e-02, -7.7239783e-03, ...,
        -1.0443001e-01, -2.6425140e-02, -9.8955519e-03],
       [-1.2799753e-02,  2.7629824e-02, -1.4718298e-03, ...,
        -4.9694967e-02,  1.4989703e-02,  3.4072924e-02]], dtype=float32)
In [31]:
model.wv.most_similar('송혜교')
Out [31]:
[('이종석', 0.892828643321991),
 ('남상미', 0.8924018740653992),
 ('김소연', 0.8526328206062317),
 ('이민정', 0.8493728041648865),
 ('신현준', 0.8422029614448547),
 ('주지훈', 0.8388743996620178),
 ('손예진', 0.8374811410903931),
 ('송윤아', 0.8355734944343567),
 ('서우', 0.8328922390937805),
 ('김동완', 0.8306312561035156)]

Pre-trained Model

In [32]:
import gensim
pre_trained_word2vec = gensim.models.KeyedVectors.load_word2vec_format('datasets/GoogleNews-vectors-negative300.bin', binary=True)
In [33]:
pre_trained_word2vec.vectors.shape # vocabulary: 3,000,000 words, dimension: 300
Out [33]:
(3000000, 300)
In [34]:
pre_trained_word2vec.similarity('this', 'is')
Out [34]:
0.40797037
In [35]:
pre_trained_word2vec.similarity('food', 'book')
Out [35]:
0.10171259
In [36]:
pre_trained_word2vec.similarity('chicken', 'pizza')
Out [36]:
0.37984928
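
The pre-trained vectors also support the classic analogy query through most_similar with positive and negative terms; a minimal sketch (the top hit is commonly reported to be 'queen', but the exact ranking and scores depend on the model file):

pre_trained_word2vec.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)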
