KR_CSV - somaz94/python-study GitHub Wiki
CSV(Comma-Separated Values)는 쉼표로 구분된 데이터를 저장하는 텍스트 파일 형식이다.
import csv
# Read a CSV file: csv.reader yields each record as a list of strings.
# newline='' leaves line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed correctly.
with open('data.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        print(row)

# Write a CSV file. The first row is the header (name, age, city); the Korean
# header text is restored from the mis-decoded bytes found in this section.
data = [
    ['이름', '나이', '도시'],
    ['John', '30', 'New York'],
    ['Alice', '25', 'London'],
]
with open('output.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data)
✅ 특징:
- 간단한 데이터 구조
- 텍스트 기반 형식
- 범용적 호환성
딕셔너리 기반으로 CSV 데이터를 처리하는 방법이다.
# Read rows as dictionaries: DictReader uses the first row as the keys.
# newline='' leaves line-ending handling to the csv module.
with open('data.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # Columns accessed by header name: '이름' (name) and '나이' (age).
        print(f"이름: {row['이름']}, 나이: {row['나이']}")

# Write dictionaries as rows; keys are name, age and city.
data = [
    {'이름': 'John', '나이': '30', '도시': 'New York'},
    {'이름': 'Alice', '나이': '25', '도시': 'London'},
]
with open('output.csv', 'w', newline='', encoding='utf-8') as file:
    # fieldnames fixes the column order of the output file.
    fieldnames = ['이름', '나이', '도시']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)
✅ 특징:
- 헤더 기반 접근
- 딕셔너리 형태 처리
- 직관적인 데이터 처리
CSV 형식을 사용자 정의하여 다양한 텍스트 파일 형식을 처리할 수 있다.
# Use a custom delimiter: read a tab-separated file.
# newline='' leaves line-ending handling to the csv module.
with open('data.tsv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        print(row)

# Custom quoting: register a reusable dialect that separates fields with ';',
# quotes with '"' only where needed, and escapes with a backslash.
csv.register_dialect(
    'custom',
    delimiter=';',
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    escapechar='\\',
)

with open('output.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, dialect='custom')
    # NOTE(review): `data` is whatever was assigned earlier in the file.
    # csv.writer iterates each row for its fields, so a list of dicts would
    # emit only the keys -- confirm a list of row sequences is intended here.
    writer.writerows(data)
✅ 특징:
- 구분자 설정
- 인용 부호 처리
- 사용자 정의 형식
CSV 데이터를 읽고 쓰는 과정에서 데이터 검증과 변환을 수행할 수 있다.
def validate_row(row):
    """Return True when *row* has a plausible age and e-mail address.

    Args:
        row: Mapping for one CSV record (for example a csv.DictReader row)
            with the fields '나이' (age) and '이메일' (e-mail).

    Returns:
        True if the age parses as an integer in the range 0..150 and the
        e-mail value contains '@'. Otherwise the reason is printed and
        False is returned.
    """
    try:
        # Age field check.
        age = int(row['나이'])
        if not 0 <= age <= 150:
            raise ValueError("유효하지 않은 나이")

        # E-mail field check.
        if '@' not in row['이메일']:
            raise ValueError("유효하지 않은 이메일")
    except (KeyError, TypeError, ValueError) as e:
        # KeyError: the column is missing; TypeError: the value is None
        # (e.g. a short row); ValueError: non-numeric or out-of-range age,
        # or an e-mail without '@'. Anything else is a real bug and is
        # allowed to propagate instead of being reported as bad data.
        print(f"검증 실패: {e}")
        return False
    return True
def process_csv(input_file, output_file):
    """Copy the rows of *input_file* that pass validate_row() to *output_file*.

    Args:
        input_file: Path of the UTF-8 CSV file to read; its first row is
            used as the header.
        output_file: Path of the UTF-8 CSV file to write. It receives the
            same header followed by the rows validate_row() accepted.

    An input without any header row produces an empty output file.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        valid_rows = [row for row in reader if validate_row(row)]
        # Capture the header while the file is still open. For an empty
        # input DictReader has nothing cached and would try to read again.
        fieldnames = reader.fieldnames or []

    # The input is fully read and closed before the output is opened.
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        if fieldnames:
            writer.writeheader()
        writer.writerows(valid_rows)
✅ 특징:
- 데이터 유효성 검사
- 자동 형변환
- 에러 처리
메모리 사용을 최적화하여 대용량 CSV 파일을 효율적으로 처리하는 방법이다.
def process_large_csv(file_path, chunk_size=1000):
    """Pass the data rows of a CSV file to process_chunk() in batches.

    Args:
        file_path: Path of the UTF-8 CSV file; its first row is skipped as
            the header.
        chunk_size: Number of rows collected before process_chunk() is
            called. The last batch may be shorter.

    process_chunk is not defined in this section; it must be available at
    module level and accept a list of rows (each row a list of strings).
    An empty file results in no calls.
    """
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip the header row. The default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        chunk = []
        for row in reader:
            chunk.append(row)
            if len(chunk) >= chunk_size:
                process_chunk(chunk)
                # Start a new list rather than clearing, so the callee may
                # keep a reference to the batch it was given.
                chunk = []

        # Flush the rows left over after the last full batch.
        if chunk:
            process_chunk(chunk)
✅ 특징:
- 메모리 효율적 처리
- 청크 단위 처리
- 대용량 파일 지원
여러 CSV 파일을 하나로 합치거나 하나의 파일을 여러 개로 분할하는 방법이다.
# Merge several CSV files into one.
def merge_csv_files(input_files, output_file):
    """Concatenate CSV files that share a header into *output_file*.

    Args:
        input_files: Iterable of paths to UTF-8 CSV files. The first row of
            every file is treated as its header.
        output_file: Path of the merged UTF-8 CSV file to create.

    The header is written once, taken from the first input that has one;
    the header row of every later file is skipped. Empty input files are
    ignored. With no inputs the output file is created empty.
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        header_written = False

        for file_path in input_files:
            with open(file_path, 'r', newline='', encoding='utf-8') as infile:
                reader = csv.reader(infile)

                # The default keeps an empty file from raising StopIteration.
                header = next(reader, None)
                if header is None:
                    continue

                # Only the first header seen goes into the output.
                if not header_written:
                    writer.writerow(header)
                    header_written = True

                # Copy the remaining data rows.
                writer.writerows(reader)
# Split one CSV file into several.
def split_csv_file(input_file, output_prefix, rows_per_file=1000):
    """Split a CSV file into numbered parts that each repeat the header.

    Args:
        input_file: Path of the UTF-8 CSV file to split; its first row is
            the header.
        output_prefix: Parts are written to "<output_prefix>_<n>.csv" with
            n starting at 1.
        rows_per_file: Maximum number of data rows per part.

    A part is only created once there is a row to put in it, so an input
    whose row count is an exact multiple of rows_per_file does not leave a
    trailing header-only file. An input with a header but no data rows
    still yields one header-only part. A completely empty input yields no
    files.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        # The default keeps an empty file from raising StopIteration.
        header = next(reader, None)
        if header is None:
            return

        file_number = 0
        row_count = 0
        current_out_file = None
        current_writer = None
        try:
            for row in reader:
                # Open the next part lazily, on the first row that needs it.
                if current_out_file is None:
                    file_number += 1
                    current_out_file = open(
                        f"{output_prefix}_{file_number}.csv",
                        'w', newline='', encoding='utf-8',
                    )
                    current_writer = csv.writer(current_out_file)
                    current_writer.writerow(header)

                current_writer.writerow(row)
                row_count += 1

                if row_count >= rows_per_file:
                    current_out_file.close()
                    current_out_file = None
                    row_count = 0

            if file_number == 0:
                # Header-only input: still emit the first part.
                with open(f"{output_prefix}_1.csv", 'w', newline='',
                          encoding='utf-8') as only_file:
                    csv.writer(only_file).writerow(header)
        finally:
            # Close the part in progress even if reading or writing failed.
            if current_out_file is not None:
                current_out_file.close()
✅ 특징:
- 다중 파일 처리
- 헤더 관리
- 분산 처리 가능
- 대용량 데이터 관리
Pandas 라이브러리를 활용하여 고급 CSV 데이터 처리를 수행할 수 있다.
import pandas as pd
# Load the CSV file into a DataFrame.
df = pd.read_csv('data.csv', encoding='utf-8')

# Explore the data.
print(df.head())      # first 5 rows
print(df.describe())  # basic statistics
print(df.columns)     # column names

# Filter and transform. Column names are Korean, restored from the
# mis-decoded bytes in this section: '나이' = age, '성' = family name,
# '이름' = given name, '전체이름' = full name.
filtered_df = df[df['나이'] > 25]
df['전체이름'] = df['성'] + ' ' + df['이름']

# Group by '도시' (city) and aggregate '나이' (age) and '급여' (salary).
grouped = df.groupby('도시').agg({
    '나이': ['mean', 'min', 'max', 'count'],
    '급여': 'sum',
})

# Save the filtered rows to a CSV file without the index column.
filtered_df.to_csv('filtered_data.csv', index=False, encoding='utf-8')
✅ 특징:
- 다양한 데이터 분석 기능
- 강력한 데이터 변환
- 간결한 구문
- 통계 및 시각화 지원
- 대용량 데이터 처리 최적화
✅ 모범 사례:
- 인코딩 지정 (UTF-8)
- newline='' 설정 (윈도우 환경에서 줄바꿈 문제 방지)
- 큰 파일은 청크 단위로 처리
- 데이터 검증 구현
- 에러 처리 추가
- 메모리 사용량 고려
- pandas 활용 고려
- BOM(Byte Order Mark) 처리 주의
- 날짜와 시간 데이터는 표준 형식 사용
- 예외적인 구분자와 인용 문자 처리 방법 숙지
- CSV 파일 생성 시 헤더 일관성 확인
- 민감 정보는 내보내기 전 마스킹 처리
- 큰 파일은 압축 형식(gzip, zip) 고려
- 스키마 변경 시 기존 데이터 호환성 고려