좋아요 데이터 생성 계획
- 전체 게시글 2400만건 중
- 40% 확률로 좋아요 생성
- 그 40% 중에서도, 50% 확률로 좋아요 1개 / 50% 확률로 좋아요 2개
- 기대 좋아요 수 : 2400(만) * 0.4 * 1.5 = 1440만건
좋아요 테이블 생성
CREATE TABLE IF NOT EXISTS article_likes(
article_like_id BIGINT NOT NULL PRIMARY KEY,
article_id BIGINT NOT NULL,
user_id BIGINT NOT NULL,
created_at DATETIME NOT NULL,
CONSTRAINT uq_article_id_and_user_id UNIQUE (article_id, user_id)
);
좋아요 - 유니크 인덱스 제거
ALTER TABLE article_likes DROP INDEX uq_article_id_and_user_id;
좋아요 데이터 생성
package com.ttasjwi.board.system.app.articlelike.data
import org.junit.jupiter.api.DisplayName
import org.junit.jupiter.api.Test
import java.io.BufferedWriter
import java.io.File
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.util.Random
class ArticleLikeDataInitializer {
companion object {
private const val USER_COUNT = 12_000_000L
private const val ARTICLE_COUNT = 24_000_000L
private val baseTime = LocalDateTime.of(2025, 6, 10, 0, 0)
private val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
private var articleLikeId = 0L
}
@Test
@DisplayName("게시글 카테고리 벌크 삽입 csv 파일 생성")
fun initialize() {
val csvFile = File("article-likes.csv")
val random = Random()
csvFile.bufferedWriter().use { writer ->
// 2400만 게시글에 대해서
for (articleId in 1L..ARTICLE_COUNT) {
// 40% 확률로 좋아요, 40% 확률로 좋아요 안 함
if (shouldLike(random)) {
makeLike(articleId, random, writer)
}
}
}
}
private fun shouldLike(random: Random): Boolean {
val randomDouble = random.nextDouble()
// 40% 확률로 좋아요
return randomDouble < 0.4
}
private fun makeLike(articleId: Long, random: Random, writer:BufferedWriter) {
val randomDouble = random.nextDouble()
// 좋아요 1개
if (randomDouble < 0.5) {
val userId = generateRandomUserId(random)
writeArticleLike(++articleLikeId, articleId, userId, writer)
return
}
// 좋아요 2개
val userId1 = generateRandomUserId(random)
var userId2 = generateRandomUserId(random)
while (userId1 == userId2) {
userId2 = generateRandomUserId(random)
}
writeArticleLike(++articleLikeId, articleId, userId1, writer)
writeArticleLike(++articleLikeId, articleId, userId2, writer)
return
}
private fun generateRandomUserId(random: Random): Long {
return random.nextLong(USER_COUNT) + 1L
}
private fun writeArticleLike(articleLikeId: Long, articleId: Long, userId: Long, writer: BufferedWriter) {
writer.write("${articleLikeId},$articleId,$userId,${baseTime.plusSeconds(articleLikeId).format(formatter)}\n")
if (articleLikeId % 100_000L == 0L) {
println("Written $articleLikeId lines...")
}
}
}
좋아요 데이터 삽입
docker cp data/article-likes.csv mysql:/tmp/article-likes.csv
LOAD DATA INFILE '/tmp/article-likes.csv'
INTO TABLE article_likes
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
(article_like_id, article_id, user_id, created_at);
mysql> select count(*) from article_likes;
+----------+
| count(*) |
+----------+
| 14401350 |
+----------+
1 row in set (0.51 sec)
mysql> select * from article_likes where article_id = 7;
+-----------------+------------+---------+---------------------+
| article_like_id | article_id | user_id | created_at |
+-----------------+------------+---------+---------------------+
| 3 | 7 | 6036871 | 2025-06-10 00:00:03 |
| 4 | 7 | 427915 | 2025-06-10 00:00:04 |
+-----------------+------------+---------+---------------------+
2 rows in set (3.35 sec)
좋아요 테이블에 유니크 제약조건 다시 추가
ALTER TABLE article_likes
ADD CONSTRAINT uq_article_id_and_user_id UNIQUE (article_id, user_id);
좋아요수 테이블 생성
CREATE TABLE IF NOT EXISTS article_like_counts(
article_id BIGINT NOT NULL PRIMARY KEY,
like_count BIGINT NOT NULL
);
좋아요수 데이터 추가
INSERT INTO article_like_counts (article_id, like_count)
SELECT article_id, COUNT(*) AS like_count
FROM article_likes
GROUP BY article_id;
mysql> select * from article_like_counts limit 10 offset 10;
+------------+------------+
| article_id | like_count |
+------------+------------+
| 22 | 2 |
| 23 | 1 |
| 29 | 2 |
| 31 | 1 |
| 34 | 2 |
| 37 | 2 |
| 44 | 1 |
| 47 | 2 |
| 50 | 2 |
| 60 | 2 |
+------------+------------+
10 rows in set (0.00 sec)
싫어요 데이터 생성 계획
- 전체 게시글 2400만건 중
- 60% 확률로 싫어요 생성
- 그 60% 중에서도 50% 확률로 싫어요 1개 / 50% 확률로 싫어요 2개
- 그런데 이런 싫어요 대상 중에서도 '정보' 게시글은 싫어요가 불가능함. 따라서 싫어요 불가능한 카테고리에 해당하는 싫어요는 전부 제거해야함.
- 기대 싫어요 수 : 2400(만) * 0.6 * 1.5 * 2 / 3 = 약 1440(만)
싫어요 테이블 생성
CREATE TABLE IF NOT EXISTS article_dislikes(
article_dislike_id BIGINT NOT NULL PRIMARY KEY,
article_id BIGINT NOT NULL,
user_id BIGINT NOT NULL,
created_at DATETIME NOT NULL,
CONSTRAINT uq_article_id_and_user_id UNIQUE (article_id, user_id)
);
싫어요 유니크 인덱스 제거
ALTER TABLE article_dislikes DROP INDEX uq_article_id_and_user_id;
싫어요 데이터 생성
package com.ttasjwi.board.system.app.articlelike.data
import org.junit.jupiter.api.DisplayName
import org.junit.jupiter.api.Test
import java.io.BufferedWriter
import java.io.File
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import java.util.Random
class ArticleDislikeDataInitializer {
companion object {
private const val USER_COUNT = 12_000_000L
private const val ARTICLE_COUNT = 24_000_000L
private val baseTime = LocalDateTime.of(2025, 6, 10, 0, 0)
private val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
private var articleDislikeId = 0L
}
@Test
@DisplayName("게시글 싫어요 데이터 csv 파일 생성")
fun initialize() {
val csvFile = File("article-dislikes.csv")
val random = Random()
csvFile.bufferedWriter().use { writer ->
// 2400만 게시글에 대해서
for (articleId in 1L..ARTICLE_COUNT) {
// 60% 확률로 싫어요, 40% 확률로 싫어요 안 함
if (shouldDislike(random)) {
makeDislike(articleId, random, writer)
}
}
}
}
private fun shouldDislike(random: Random): Boolean {
val randomDouble = random.nextDouble()
// 60% 확률로 싫어요
return randomDouble < 0.6
}
private fun makeDislike(articleId: Long, random: Random, writer:BufferedWriter) {
val randomDouble = random.nextDouble()
// 싫어요 1개
if (randomDouble < 0.5) {
val userId = generateRandomUserId(random)
writeArticleDislike(++articleDislikeId, articleId, userId, writer)
return
}
// 싫어요 2개
val userId1 = generateRandomUserId(random)
var userId2 = generateRandomUserId(random)
while (userId1 == userId2) {
userId2 = generateRandomUserId(random)
}
writeArticleDislike(++articleDislikeId, articleId, userId1, writer)
writeArticleDislike(++articleDislikeId, articleId, userId2, writer)
return
}
private fun generateRandomUserId(random: Random): Long {
return random.nextLong(USER_COUNT) + 1L
}
private fun writeArticleDislike(articleDislikeId: Long, articleId: Long, userId: Long, writer: BufferedWriter) {
writer.write("${articleDislikeId},$articleId,$userId,${baseTime.plusSeconds(articleDislikeId).format(formatter)}\n")
if (articleDislikeId % 100_000L == 0L) {
println("Written $articleDislikeId lines...")
}
}
}
싫어요 데이터 삽입
docker cp data/article-dislikes.csv mysql:/tmp/article-dislikes.csv
LOAD DATA INFILE '/tmp/article-dislikes.csv'
INTO TABLE article_dislikes
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
(article_dislike_id, article_id, user_id, created_at);
싫어요 - '정보' 카테고리의 '싫어요' 제거
WITH cte AS (
SELECT ad.article_id
FROM article_dislikes ad
JOIN articles a ON a.article_id = ad.article_id
WHERE a.article_category_id % 4 = 3
)
DELETE FROM article_dislikes
WHERE article_id IN (SELECT article_id FROM cte);
- 정보 카테고리는 카테고리 id 식별자를 4로 나눈 나머지가 3이다.
- 대상 '싫어요'들을 제거한다.
싫어요 데이터 확인
mysql> select count(*) from article_dislikes;
+----------+
| count(*) |
+----------+
| 14402405 |
+----------+
1 row in set (22.20 sec)
mysql> select ad.article_id, ad.user_id, a.article_category_id
-> from article_dislikes ad
-> join articles a on a.article_id = ad.article_id
-> where ad.article_id = 8;
+------------+----------+---------------------+
| article_id | user_id | article_category_id |
+------------+----------+---------------------+
| 8 | 3576857 | 2 |
| 8 | 11897123 | 2 |
+------------+----------+---------------------+
2 rows in set (0.01 sec)
싫어요 유니크 제약조건 추가
ALTER TABLE article_dislikes
ADD CONSTRAINT uq_article_id_and_user_id UNIQUE (article_id, user_id);
싫어요수 테이블 생성
CREATE TABLE IF NOT EXISTS article_like_counts(
article_id BIGINT NOT NULL PRIMARY KEY,
like_count BIGINT NOT NULL
);
싫어요수 데이터 추가
INSERT INTO article_dislike_counts (article_id, dislike_count)
SELECT article_id, COUNT(*) AS dislike_count
FROM article_dislikes
GROUP BY article_id;
mysql> select * from article_dislike_counts limit 10 offset 10;
+------------+---------------+
| article_id | dislike_count |
+------------+---------------+
| 26 | 1 |
| 30 | 2 |
| 31 | 1 |
| 35 | 1 |
| 37 | 1 |
| 38 | 2 |
| 39 | 1 |
| 40 | 2 |
| 42 | 1 |
| 43 | 1 |
+------------+---------------+
10 rows in set (0.00 sec)