KR_CSV - somaz94/python-study GitHub Wiki

Python CSV 처리 개념 정리


1️⃣ CSV 기초

CSV(Comma-Separated Values)는 쉼표로 구분된 데이터를 저장하는 텍스트 파일 형식이다.

import csv

# Read a CSV file and print every parsed row (each row is a list of strings)
with open('data.csv', 'r', encoding='utf-8') as source:
    for record in csv.reader(source):
        print(record)

# Write a CSV file: the first row is the header, the rest are records
data = [
    ['์ด๋ฆ„', '๋‚˜์ด', '๋„์‹œ'],
    ['John', '30', 'New York'],
    ['Alice', '25', 'London'],
]

# newline='' hands line-ending control to the csv module
with open('output.csv', 'w', newline='', encoding='utf-8') as target:
    csv.writer(target).writerows(data)

✅ 특징:

  • 간단한 데이터 구조
  • 텍스트 기반 형식
  • 범용적 호환성


2๏ธโƒฃ DictReader์™€ DictWriter

๋”•์…”๋„ˆ๋ฆฌ ๊ธฐ๋ฐ˜์œผ๋กœ CSV ๋ฐ์ดํ„ฐ๋ฅผ ์ฒ˜๋ฆฌํ•˜๋Š” ๋ฐฉ๋ฒ•์ด๋‹ค.

# Read rows as dictionaries keyed by the names in the header row
with open('data.csv', 'r', encoding='utf-8') as source:
    for record in csv.DictReader(source):
        print(f"์ด๋ฆ„: {record['์ด๋ฆ„']}, ๋‚˜์ด: {record['๋‚˜์ด']}")

# Write dictionaries as CSV rows
data = [
    {'์ด๋ฆ„': 'John', '๋‚˜์ด': '30', '๋„์‹œ': 'New York'},
    {'์ด๋ฆ„': 'Alice', '๋‚˜์ด': '25', '๋„์‹œ': 'London'},
]

# fieldnames fixes both the header row and the column order
fieldnames = ['์ด๋ฆ„', '๋‚˜์ด', '๋„์‹œ']

with open('output.csv', 'w', newline='', encoding='utf-8') as target:
    writer = csv.DictWriter(target, fieldnames=fieldnames)
    writer.writeheader()
    for record in data:
        writer.writerow(record)

✅ 특징:

  • 헤더 기반 접근
  • 딕셔너리 형태 처리
  • 직관적인 데이터 처리


3️⃣ CSV 파일 포맷 설정

CSV 형식을 사용자 정의하여 다양한 텍스트 파일 형식을 처리할 수 있다.

# Use a custom delimiter: parse a tab-separated file
with open('data.tsv', 'r', encoding='utf-8') as source:
    for record in csv.reader(source, delimiter='\t'):
        print(record)

# Custom quoting: bundle the format options into a named dialect
custom_dialect_options = {
    'delimiter': ';',
    'quotechar': '"',
    'quoting': csv.QUOTE_MINIMAL,
    'escapechar': '\\',
}
csv.register_dialect('custom', **custom_dialect_options)

# NOTE(review): `data` is not defined in this snippet; it is whatever an
# earlier example left bound. csv.writer expects each row to be a sequence,
# so a list of dicts would be written as its keys - confirm the intended input.
with open('output.csv', 'w', newline='', encoding='utf-8') as target:
    csv.writer(target, dialect='custom').writerows(data)

✅ 특징:

  • 구분자 설정
  • 인용 부호 처리
  • 사용자 정의 형식


4️⃣ 데이터 검증과 변환

CSV 데이터를 읽고 쓰는 과정에서 데이터 검증과 변환을 수행할 수 있다.

def validate_row(row):
    """Check that a CSV row has a plausible age and e-mail address.

    Args:
        row: Mapping for one CSV record (for example a ``csv.DictReader``
            row). The age column must parse as an integer from 0 to 150
            inclusive, and the e-mail column must contain an ``@``.

    Returns:
        True when both checks pass, False otherwise. The reason for a
        failure is printed to standard output.
    """
    try:
        # Age must be an integer in the range 0..150
        age = int(row['๋‚˜์ด'])
        if not 0 <= age <= 150:
            raise ValueError("์œ ํšจํ•˜์ง€ ์•Š์€ ๋‚˜์ด")

        # E-mail must contain an '@'
        if '@' not in row['์ด๋ฉ”์ผ']:
            raise ValueError("์œ ํšจํ•˜์ง€ ์•Š์€ ์ด๋ฉ”์ผ")
    except (KeyError, TypeError, ValueError) as e:
        # Only the failures bad data can cause are treated as "invalid":
        # KeyError   - the column is missing from the row
        # TypeError  - the value is None (DictReader pads short rows with None)
        # ValueError - the age is not a number, or one of the checks above failed
        # Anything else is a programming error and is allowed to propagate.
        print(f"๊ฒ€์ฆ ์‹คํŒจ: {e}")
        return False
    return True

def process_csv(input_file, output_file):
    """Copy the rows of ``input_file`` that pass ``validate_row`` to ``output_file``.

    The output keeps the input's header row and column order. All valid rows
    are collected before the output is opened, so both paths may name the
    same file. An input without a header row (an empty file) yields an empty
    output file.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to create or overwrite.
    """
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        # DictReader loads the header lazily; fetch it while the file is
        # still open so an empty input does not touch a closed file later.
        fieldnames = reader.fieldnames
        valid_rows = [row for row in reader if validate_row(row)]

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        if fieldnames is None:
            # No header means no rows either: leave the output empty
            return
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(valid_rows)

✅ 특징:

  • 데이터 유효성 검사
  • 자동 형변환
  • 에러 처리


5️⃣ 대용량 CSV 처리

메모리 사용을 최적화하여 대용량 CSV 파일을 효율적으로 처리하는 방법이다.

def process_large_csv(file_path, chunk_size=1000):
    """Stream a CSV file and pass its data rows to ``process_chunk`` in batches.

    The first row is treated as a header and skipped. The remaining rows are
    gathered into lists and each list is handed to ``process_chunk`` as soon
    as it reaches ``chunk_size`` rows, so only one batch is held at a time.
    A final shorter batch is passed on as well. An empty file is a no-op.

    ``process_chunk`` is not defined in this section; it must be available
    in the module that contains this function.

    Args:
        file_path: Path of the UTF-8 CSV file to read.
        chunk_size: Number of rows that triggers a call to ``process_chunk``.
    """
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)

        # Skip the header row; the default avoids StopIteration on an empty file
        if next(reader, None) is None:
            return

        chunk = []
        for row in reader:
            chunk.append(row)

            if len(chunk) >= chunk_size:
                process_chunk(chunk)
                chunk = []

        # Flush the rows left over after the last full batch
        if chunk:
            process_chunk(chunk)

✅ 특징:

  • 메모리 효율적 처리
  • 청크 단위 처리
  • 대용량 파일 지원


6️⃣ CSV 파일 병합 및 분할

여러 CSV 파일을 하나로 합치거나 하나의 파일을 여러 개로 분할하는 방법이다.

# Merge several CSV files
def merge_csv_files(input_files, output_file):
    """Concatenate several CSV files into one with a single header row.

    The header written to the output is the first row of the first
    non-empty input file. The first row of every later file is assumed to
    be a header too and is dropped without being compared to the first one.
    Empty input files are skipped.

    Args:
        input_files: Iterable of paths of the UTF-8 CSV files to merge, in
            output order.
        output_file: Path of the UTF-8 CSV file to create or overwrite.
    """
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        header_written = False

        for file_path in input_files:
            with open(file_path, 'r', newline='', encoding='utf-8') as infile:
                reader = csv.reader(infile)

                # The default keeps an empty file from raising StopIteration
                header = next(reader, None)
                if header is None:
                    continue

                # Only the first header seen is written to the output
                if not header_written:
                    writer.writerow(header)
                    header_written = True

                # Copy the data rows
                writer.writerows(reader)

# Split a CSV file
def split_csv_file(input_file, output_prefix, rows_per_file=1000):
    """Split one CSV file into numbered files that each repeat the header.

    Output files are named ``{output_prefix}_1.csv``, ``{output_prefix}_2.csv``
    and so on, each holding the header row plus at most ``rows_per_file`` data
    rows. A new file is started only when another data row is available, so
    no trailing header-only file is produced. An input that has a header but
    no data rows still produces ``{output_prefix}_1.csv`` containing just the
    header. An empty input produces no files.

    Args:
        input_file: Path of the UTF-8 CSV file to split.
        output_prefix: Path prefix for the generated files.
        rows_per_file: Maximum number of data rows per output file. Values
            below 1 behave like 1.
    """
    max_rows = max(rows_per_file, 1)

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)

        # The default keeps an empty input from raising StopIteration
        header = next(reader, None)
        if header is None:
            return

        file_number = 1
        row_count = 0

        current_out_file = open(f"{output_prefix}_{file_number}.csv", 'w', newline='', encoding='utf-8')
        try:
            current_writer = csv.writer(current_out_file)
            current_writer.writerow(header)

            for row in reader:
                # Roll over lazily: only once the current file is full AND
                # there is another row to write.
                if row_count >= max_rows:
                    current_out_file.close()
                    file_number += 1
                    row_count = 0

                    current_out_file = open(f"{output_prefix}_{file_number}.csv", 'w', newline='', encoding='utf-8')
                    current_writer = csv.writer(current_out_file)
                    current_writer.writerow(header)

                current_writer.writerow(row)
                row_count += 1
        finally:
            # Close the active output even if reading or writing fails
            current_out_file.close()

✅ 특징:

  • 다중 파일 처리
  • 헤더 관리
  • 분산 처리 가능
  • 대용량 데이터 관리


7๏ธโƒฃ Pandas์™€ ํ•จ๊ป˜ ์‚ฌ์šฉ

Pandas ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋ฅผ ํ™œ์šฉํ•˜์—ฌ ๊ณ ๊ธ‰ CSV ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ๋ฅผ ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋‹ค.

import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('data.csv', encoding='utf-8')

# Explore the data
print(df.head())      # first 5 rows
print(df.describe())  # summary statistics
print(df.columns)     # column names

# Filter and transform the data
# The filtered frame is taken before the new column is added to `df`,
# so the file saved below does not contain that column.
age_over_25 = df['๋‚˜์ด'] > 25
filtered_df = df[age_over_25]
df['์ „์ฒด์ด๋ฆ„'] = df['์„ฑ'] + ' ' + df['์ด๋ฆ„']

# Group and aggregate the data
aggregations = {
    '๋‚˜์ด': ['mean', 'min', 'max', 'count'],
    '๊ธ‰์—ฌ': 'sum',
}
grouped = df.groupby('๋„์‹œ').agg(aggregations)

# Save to a CSV file (index=False omits the row index)
filtered_df.to_csv('filtered_data.csv', index=False, encoding='utf-8')

✅ 특징:

  • 다양한 데이터 분석 기능
  • 강력한 데이터 변환
  • 간결한 구문
  • 통계 및 시각화 지원
  • 대용량 데이터 처리 최적화


주요 팁

✅ 모범 사례:

  • 인코딩 지정 (UTF-8)
  • newline='' 설정 (윈도우 환경에서 줄바꿈 문제 방지)
  • 큰 파일은 청크 단위로 처리
  • 데이터 검증 구현
  • 에러 처리 추가
  • 메모리 사용량 고려
  • pandas 활용 고려
  • BOM(Byte Order Mark) 처리 주의
  • 날짜와 시간 데이터는 표준 형식 사용
  • 예외적인 구분자와 인용 문자 처리 방법 숙지
  • CSV 파일 작성 전 헤더 일관성 확인
  • 민감 정보는 내보내기 전 마스킹 처리
  • 큰 파일은 압축 형식(gzip, zip) 고려
  • 스키마 변경 시 기존 데이터 호환성 고려


⚠️ **GitHub.com Fallback** ⚠️