Python 정규표현식 개념 정리

1️⃣ 기본 패턴

정규표현식의 기본 패턴들이다.

import re

# Plain literal search: locate the first occurrence of "World".
text = "Hello, World!"
match = re.compile(r"World").search(text)
print(match[0])  # World

# Metacharacters: \d+ stands for a run of one or more digits.
pattern = r"\d+"
text = "There are 123 apples"
match = re.compile(pattern).search(text)
print(match[0])  # 123

# Anchoring a pattern to the start or the end of the string.
text = "Python is amazing"
start_match = re.compile(r"^Python").search(text)  # must begin with "Python"
if start_match:
    print(start_match[0])  # Python
else:
    print("No match")

end_match = re.compile(r"amazing$").search(text)  # must end with "amazing"
if end_match:
    print(end_match[0])  # amazing
else:
    print("No match")

# Case-insensitive search through the IGNORECASE flag.
case_insensitive = re.compile(r"python", re.IGNORECASE).search(text)
if case_insensitive:
    print(case_insensitive[0])  # Python
else:
    print("No match")

# MULTILINE flag: ^ matches at the beginning of every line, not only at the
# beginning of the whole string.
multiline_text = """First line
Second line
Third line"""
multiline_match = re.compile(r"^.*line", re.MULTILINE).findall(multiline_text)
print(multiline_match)  # ['First line', 'Second line', 'Third line']

✅ 특징:

기본 문자열 매칭 및 패턴 검색
다양한 메타문자를 이용한 패턴 정의
문자열 위치 기반 매칭 (시작, 끝)
정규표현식 플래그 활용 (대소문자 무시, 멀티라인 등)
매칭 결과 추출 및 처리
다양한 상황에 맞는 패턴 적용
다양한 언어에서 공통적으로 사용되는 표준
텍스트 처리의 강력한 도구

2️⃣ 자주 사용되는 패턴

정규표현식에서 가장 흔하게 사용되는 메타문자와 패턴들이다.

import re

# Shorthand character classes (ASCII equivalents shown in brackets):
# \d: a digit [0-9]
# \D: anything that is not a digit [^0-9]
# \w: a word character (letter, digit, underscore) [a-zA-Z0-9_]
# \W: anything that is not a word character [^a-zA-Z0-9_]
# \s: whitespace (space, tab, newline, ...)
# \S: anything that is not whitespace
# .: any single character except a newline
# NOTE: with str patterns, \d, \w and \s also match the corresponding
# Unicode characters unless the re.ASCII flag is given.

text = "abc123 def456 ghi789"

# Every run of digits.
numbers = [m.group() for m in re.finditer(r"\d+", text)]
print(numbers)  # ['123', '456', '789']

# Every run of word characters.
words = [m.group() for m in re.finditer(r"\w+", text)]
print(words)  # ['abc123', 'def456', 'ghi789']

# Split each token into its letter part and its digit part.
alphanumeric = [m.groups() for m in re.finditer(r"([a-z]+)(\d+)", text)]
print(alphanumeric)  # [('abc', '123'), ('def', '456'), ('ghi', '789')]

# Character classes
# [...]: matches any one of the listed characters
# [^...]: matches any character that is NOT listed
# [a-z]: lowercase letter range
# [A-Z]: uppercase letter range
# [0-9]: digit range
# [가-힣]: Hangul syllable range

# Lowercase vowels only.
vowels = re.compile(r"[aeiou]").findall("Hello World")
print(vowels)  # ['e', 'o', 'o']

# Digits and a handful of special characters only.
symbols = re.compile(r"[0-9!@#$%^&*]").findall("password123!")
print(symbols)  # ['1', '2', '3', '!']

# Runs of Korean (Hangul) characters.
korean = re.compile(r"[가-힣]+").findall("Hello 안녕하세요 World")
print(korean)  # ['안녕하세요']

# Anchors
# ^: start of the string
# $: end of the string
# \b: a word boundary
# \B: a position that is not a word boundary

# \b on both sides keeps "cat" from matching inside "cats".
word_boundary = [
    m.group() for m in re.finditer(r"\bcat\b", "The cat sat on the cats")
]
print(word_boundary)  # ['cat']

# Email address validation
def is_valid_email(email):
    """Return True if *email* has the simple shape ``local@domain.tld``.

    The whole string must match: a local part made of letters, digits and
    ``._%+-``, an ``@``, a domain made of letters, digits, dots and hyphens,
    and a final dot followed by at least two letters.

    ``re.fullmatch`` is used instead of ``re.match`` with ``^...$`` because
    ``$`` also matches just before a trailing newline, which would let
    ``"user@example.com\\n"`` pass as valid.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    return re.fullmatch(pattern, email) is not None


print(is_valid_email("user@example.com"))    # True
print(is_valid_email("invalid_email"))       # False
print(is_valid_email("user@domain"))         # False

# URL extraction: scheme, host ending in a 2+ letter TLD, optional path.
text_with_urls = "Visit our website at https://www.example.com or http://subdomain.example.org"
urls = [
    m.group(0)
    for m in re.finditer(r"https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/\S*)?", text_with_urls)
]
print(urls)  # ['https://www.example.com', 'http://subdomain.example.org']

✅ 특징:

다양한 문자 클래스를 활용한 패턴 정의
특정 문자 집합이나 범위를 지정하는 방법
부정 패턴을 사용한 제외 매칭
단어 경계와 위치 앵커를 활용한 정확한 매칭
실용적인 정규표현식 패턴 예제 (이메일, URL 등)
복합적인 패턴 구성 방법
패턴의 가독성과 유지보수성 고려
다국어 텍스트 처리 방법

3️⃣ 반복 패턴

정규표현식에서 문자나 패턴의 반복을 처리하는 방법이다.

import re

# Main repetition quantifiers
# *: zero or more repetitions
# +: one or more repetitions
# ?: zero or one repetition (optional)
# {n}: exactly n repetitions
# {n,}: n or more repetitions
# {n,m}: between n and m repetitions (inclusive)

text = "hello 123 world 456"

# "*": the "e" may be missing or repeated any number of times.
print(re.compile(r"he*llo").findall("hllo hello heello"))  # ['hllo', 'hello', 'heello']

# "+": at least one "e" is required.
print(re.compile(r"he+llo").findall("hllo hello heello"))  # ['hello', 'heello']

# "?": the "u" is optional.
print(re.compile(r"colou?r").findall("color colour"))  # ['color', 'colour']

# {n}: exactly two "a" characters.
pattern = r"ca{2}t"
print(re.compile(pattern).findall("cat caat caaat"))  # ['caat']

# {n,}: two or more "a" characters.
pattern = r"ca{2,}t"
print(re.compile(pattern).findall("cat caat caaat caaaat"))  # ['caat', 'caaat', 'caaaat']

# {n,m}: two or three "a" characters.
pattern = r"ca{2,3}t"
print(re.compile(pattern).findall("cat caat caaat caaaat"))  # ['caat', 'caaat']

# Greedy versus non-greedy repetition
greedy_pattern = r"<.+>"
non_greedy_pattern = r"<.+?>"
html = "<p>첫 번째 단락</p><p>두 번째 단락</p>"

# Greedy (the default): consumes as much as possible, so the match runs
# from the first "<" to the last ">".
print([m.group() for m in re.finditer(greedy_pattern, html)])  # ['<p>첫 번째 단락</p><p>두 번째 단락</p>']

# Non-greedy (trailing "?"): consumes as little as possible, so each tag is
# matched on its own.
print([m.group() for m in re.finditer(non_greedy_pattern, html)])  # ['<p>', '</p>', '<p>', '</p>']

# A more involved repetition pattern
# Phone number format validation (e.g. 010-1234-5678 or 010-123-4567)
phone_pattern = r"01[016789]-\d{3,4}-\d{4}"


def is_valid_phone(phone):
    """Return True if *phone* is, in its entirety, a number matching phone_pattern.

    ``re.fullmatch`` is required for validation: ``re.match`` only anchors
    the start, so an over-long input such as "010-1234-56789" would be
    accepted because its prefix matches.
    """
    return re.fullmatch(phone_pattern, phone) is not None


phones = [
    "010-1234-5678",
    "011-123-4567",
    "016-123-4567",
    "010-12-345",  # malformed
    "010-12345-6789",  # malformed
]

for phone in phones:
    is_valid = is_valid_phone(phone)
    print(f"{phone}: {'유효' if is_valid else '유효하지 않음'}")

# Word extraction
sentence = "Python programming is fun and challenging. Python is powerful."
# Only the standalone word "Python".
print(re.compile(r"\bPython\b").findall(sentence))  # ['Python', 'Python']

# Repetition combined with groups: split each date into year, month, day.
date_text = "Date: 2023-05-15, Modified: 2023-06-30"
dates = [m.groups() for m in re.finditer(r"(\d{4})-(\d{2})-(\d{2})", date_text)]
print(dates)  # [('2023', '05', '15'), ('2023', '06', '30')]

# Alternating runs of digits and letters.
text = "123abc456def789ghi"
# Each match pairs one digit run with the letter run that follows it.
pattern = r"(\d+)([a-z]+)"
matches = [m.groups() for m in re.finditer(pattern, text)]
print(matches)  # [('123', 'abc'), ('456', 'def'), ('789', 'ghi')]

✅ 특징:

다양한 반복 수량자를 사용한 패턴 매칭
정확한 반복 횟수 지정 및 범위 설정
탐욕적(greedy) 및 비탐욕적(non-greedy) 매칭 방식
복잡한 패턴에서 반복 제어 방법
실제 응용 사례(전화번호, 날짜 등) 검증
반복 패턴과 그룹의 결합
효율적인 반복 패턴 작성법
다양한 형식의 텍스트 패턴 매칭

4️⃣ 그룹과 참조

정규표현식에서 패턴의 일부를 그룹화하고 참조하는 방법이다.

import re

# Basic grouping: parentheses capture parts of the match.
text = "Smith,John"
pattern = r"(\w+),(\w+)"
match = re.compile(pattern).match(text)

if match is not None:
    # Group 0 is the entire match.
    print(match[0])  # Smith,John

    # Individual groups are numbered from 1.
    print(match[1])  # Smith
    print(match[2])  # John

    # All captured groups at once, as a tuple.
    print(match.groups())  # ('Smith', 'John')

# Named groups: (?P<name>pattern)
pattern = r"(?P<last>\w+),(?P<first>\w+)"
match = re.compile(pattern).match(text)

if match is not None:
    # Groups can be looked up by name.
    print(match["last"])   # Smith
    print(match["first"])  # John

    # groupdict() returns the named groups as a dictionary.
    print(match.groupdict())  # {'last': 'Smith', 'first': 'John'}

# Alternation inside a group (|)
colors = "red green blue yellow"
# Matches red, green or blue.
color_pattern = r"(red|green|blue)"
color_matches = [m.group(1) for m in re.finditer(color_pattern, colors)]
print(color_matches)  # ['red', 'green', 'blue']

# Non-capturing group: (?:pattern)
# Takes part in matching but is left out of the captured groups.
text = "apple and banana"
pattern = r"(?:apple|banana) and (\w+)"
match = re.compile(pattern).search(text)
print(match.groups())  # ('banana',)

# Backreferences
# Inside a pattern, \number refers to the text captured by an earlier group
# (\g<number> is the equivalent form used in replacement strings).
html = "<div>내용1</div><span>내용2</span>"

# Without a backreference the closing tag is captured independently, so
# mismatched opening/closing tag names would match as well.
print(re.compile(r"<(\w+)>(.*?)</(\w+)>").findall(html))
# [('div', '내용1', 'div'), ('span', '내용2', 'span')]

# With \1 the closing tag must repeat the opening tag's name.
print(re.compile(r"<(\w+)>(.*?)</\1>").findall(html))
# [('div', '내용1'), ('span', '내용2')]

# Backreference to a named group: (?P=name)
pattern = r"<(?P<tag>\w+)>(.*?)</(?P=tag)>"
matches = [m.groups() for m in re.finditer(pattern, html)]
print(matches)  # [('div', '내용1'), ('span', '내용2')]

# Group references in a substitution
text = "John Smith"
# Reorder the name (First Last -> Last, First): \1 and \2 in the replacement
# stand for the first and second captured words.
new_text = re.compile(r"(\w+) (\w+)").sub(r"\2, \1", text)
print(new_text)  # Smith, John

# Nested groups
address = "123 Main St, Anytown, CA 12345"
pattern = r"((\d+) ([A-Za-z\s]+), ([A-Za-z]+), ([A-Z]{2}) (\d{5}))"
match = re.compile(pattern).search(address)

if match is not None:
    # Groups are numbered by the position of their opening parenthesis, so
    # groups() yields, in order:
    #   1: full address  -> 123 Main St, Anytown, CA 12345
    #   2: house number  -> 123
    #   3: street name   -> Main St
    #   4: city          -> Anytown
    #   5: state         -> CA
    #   6: ZIP code      -> 12345
    # Print one group per line.
    print(*match.groups(), sep="\n")

# Lookahead and lookbehind assertions
# Positive lookahead (?=pattern): pattern must follow the current position
# Negative lookahead (?!pattern): pattern must NOT follow
# Positive lookbehind (?<=pattern): pattern must precede the current position
# Negative lookbehind (?<!pattern): pattern must NOT precede

# Password check: at least 8 characters, containing an uppercase letter,
# a digit and one of the special characters !@#$%^&*
passwords = ["abc123", "Password1", "P@ssw0rd", "Simple"]

for pwd in passwords:
    # Each requirement is tested separately; the last three use a positive
    # lookahead.
    has_length = bool(re.search(r".{8,}", pwd))
    has_upper = bool(re.search(r"(?=.*[A-Z])", pwd))
    has_digit = bool(re.search(r"(?=.*\d)", pwd))
    has_special = bool(re.search(r"(?=.*[!@#$%^&*])", pwd))

    is_valid = all((has_length, has_upper, has_digit, has_special))
    print(f"{pwd}: {'유효' if is_valid else '유효하지 않음'}")
    # Result: only 'P@ssw0rd' is valid

# Positive lookbehind: take the digits that come right after a "$",
# without including the "$" itself in the match.
price_text = "Items: $10, $20, $30"
prices = [m.group() for m in re.finditer(r"(?<=\$)\d+", price_text)]
print(prices)  # ['10', '20', '30']

# Negative lookahead: words that do NOT end with "y".
# The lookahead has to sit at the start of the word, before anything is
# consumed. Placed after a greedy \w+ it is evaluated only once the whole
# word (including a final "y") has already been consumed, so it can never
# reject a word.
text = "apple, applet, application, apply"
not_ending_with_y = re.findall(r"\b(?!\w*y\b)\w+", text)
print(not_ending_with_y)  # ['apple', 'applet', 'application']

✅ 특징:

괄호를 사용한 패턴 일부 그룹화
인덱스 또는 이름으로 그룹 접근
비캡처 그룹으로 매칭은 하되 결과에서 제외
후방 참조로 동일 패턴 재사용
중첩 그룹으로 복잡한 패턴 구조화
정규식 치환에서 그룹 참조 활용
lookahead와 lookbehind 어설션으로 조건부 매칭
복잡한 텍스트 검증 및 추출 패턴 구현

KR_RegularExpression - somaz94/python-study GitHub Wiki

Python 정규표현식 개념 정리

1️⃣ 기본 패턴

2️⃣ 자주 사용되는 패턴

3️⃣ 반복 패턴

4️⃣ 그룹과 참조

⚠️ GitHub.com Fallback ⚠️

KR_RegularExpression - somaz94/python-study GitHub Wiki

Python 정규표현식 개념 정리

1️⃣ 기본 패턴

2️⃣ 자주 사용되는 패턴

3️⃣ 반복 패턴

4️⃣ 그룹과 참조

⚠️ **GitHub.com Fallback** ⚠️

⚠️ GitHub.com Fallback ⚠️