python regexp - ghdrako/doc_snipets GitHub Wiki
import re
# Quick tour of the most common `re` helpers. Every entry in `checks` is
# truthy, so all(checks) is True.
checks = [                                   # all of these are true, because
    not re.match("a", "cat"),                #  * 'cat' doesn't start with 'a'
    re.search("a", "cat"),                   #  * 'cat' has an 'a' in it
    not re.search("c", "dog"),               #  * 'dog' doesn't have a 'c' in it
    3 == len(re.split("[ab]", "carbs")),     #  * split on a or b to ['c','r','s']
    "R-D-" == re.sub("[0-9]", "-", "R2D2"),  #  * replace digits with dashes
]
print(all(checks))  # prints True
import re
# Build the delimiter pattern once: a character class matching either a
# comma or an underscore.
regex = re.compile(r"[,_]")
# NOTE(review): messy_data is not defined in this snippet; presumably a
# string whose words are separated by "," or "_" -- confirm against the caller.
separated_words2 = re.split(regex, messy_data)
import re
# OOP style: compile the pattern once, then call methods on the Pattern object.
regex = re.compile("do")
regex.pattern                   # 'do' -- the source string the Pattern was built from
regex.search("do homework")     # <re.Match object; span=(0, 2), match='do'>
regex.findall("don't do that")  # ['do', 'do']
import re

# Functional style: hand the pattern and the string directly to the
# module-level helpers instead of compiling a Pattern object first.
re.search("pattern", "the string to be searched")   # None -- "pattern" does not occur
re.findall("pattern", "the string to be searched")  # []
Behind the scenes, when we call re.search, Python creates the Pattern object for us and calls the search method on the pattern.
In OOP style when you use the compile function to create a Pattern object, the compiled pattern is cached in such a way that it’s more efficient to use the pattern multiple times because there is no need to compile the pattern the second time.
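By contrast, the functional approach has to obtain the pattern on every call. The re module does keep an internal cache of recently used patterns, so it rarely recompiles from scratch, but each call still pays for the cache lookup — a cost that a pattern compiled once with re.compile and reused avoids.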
# Without a raw string, the regex \\task (a literal backslash followed by
# "task") needs both of its backslashes escaped again in the Python literal.
task_pattern = re.compile("\\\\task")  # \\task - both backslashes must be escaped
# Four candidates with one to four backslashes in the literal; "\t" is a tab.
texts = ["\task", "\\task", "\\\task", "\\\\task"]
for text in texts:
    # match() only succeeds when the pattern matches at the start of the string.
    print(f"Match {text!r}: {task_pattern.match(text)}")
# output the following lines:
# Match '\task': None
# Match '\\task': <re.Match object; span=(0, 5), match='\\task'>
# Match '\\\task': None
# Match '\\\\task': None
# With a raw string the literal is passed to the regex engine unchanged, so
# the regex \\task (literal backslash + "task") can be written as-is.
task_pattern_r = re.compile(r"\\task")  # with raw string no need to escape special characters
# Same four candidates as in the non-raw example; "\t" is a tab.
texts = ["\task", "\\task", "\\\task", "\\\\task"]
for text in texts:
    # match() only succeeds when the pattern matches at the start of the string.
    print(f"Match {text!r}: {task_pattern_r.match(text)}")
# output the following lines:
# Match '\task': None
# Match '\\task': <re.Match object; span=(0, 5), match='\\task'>
# Match '\\\task': None
# Match '\\\\task': None
any decimal digit - \d
any character that is not a decimal digit - \D
any whitespace, including space, \t, \n, \r, \f, \v - \s
any character that isn't a whitespace - \S
any word character, meaning alphanumeric characters plus the underscore - \w
any character that is not a word character - \W
any character except a newline - .
a set of defined characters - []
[] to define a character set:
- individual characters [abcxyz]
- range of characters. [a-z] [A-Z]
- combine different ranges of characters: [a-dw-z] matches a-d and w-z
- a|b a or b
- (abc) abc as a group
- [^a] any character other than a
# search() scans the whole string; (\w\d)+ is one or more repetitions of
# "a word character followed by a digit".
match = re.search(r"(\w\d)+", "xyza2b1c3dd")
print(match)
# output: <re.Match object; span=(3, 9), match='a2b1c3'>
print("matched:", match.group())
# output: matched: a2b1c3
print("span:", match.span())
# output: span: (3, 9)
print(f"start: {match.start()} & end: {match.end()}")
# output: start: 3 & end: 9
# re.match returns None when the pattern does not match at the start of the
# string, so the result doubles as the truth test for the branch below.
match = re.match("pattern", "string to match")
if match:
    print("do something with the matched")
else:
    print("found no matches")
# Two capturing groups separated by ", "; the expressions below are meant to
# be evaluated in an interactive session, which echoes each value.
match = re.match(r"(\w+), (\w+)", "Homework, urgent; today")
match
# output: <re.Match object; span=(0, 16), match='Homework, urgent'>
match.groups()  # every captured group, as a tuple
# output: ('Homework', 'urgent')
match.group(0)  # group 0 is the whole match
# output: 'Homework, urgent'
match.group(1)
# output: 'Homework'
match.group(2)
# output: 'urgent'
match.span(0)  # span of the whole match
# output: (0, 16)
match.span(1)
# output: (0, 8)
match.span(2)
# output: (10, 16)
Method | Code example | Match/return value |
search: Returns a Match if a match is found anywhere in the string. | re.search(r"\d+", "ab12xy") |
'12' |
_ | re.search(r"\d+", "abxy") |
None |
match: Returns a Match only if a match is found at the string’s beginning. | re.match(r"\d+", "ab12xy") |
None |
_ | re.match(r"\d+", "12abxy") |
'12' |
findall: Returns a list of strings that match the pattern. When the pattern has multiple groups, the item is a tuple. | re.findall(r"h[ie]\w", "hi hey hello") |
['hey', 'hel'] |
_ | re.findall(r"(h|H)(i|e)","Hey hello") |
[('H', 'e'), ('h', 'e')] |
finditer: Returns an iterator that yields the Match objects. | re.finditer(r"(h|H)(i|e)","hi Hey hello") |
An iterator |
split: Splits the string by the pattern. | re.split(r"\d+",'a1b2c3d4e') |
['a', 'b', 'c', 'd', 'e'] |
sub: Creates a string by replacing the matched with the replacement. | re.sub(r"\D", "-",'123,456_789') |
'123-456-789' |
# Pattern with three capturing groups: a 3-digit id, a single-word title and
# a free-text description, laid out as "<id>, <title>; <description>".
regex = re.compile(r"(\d{3}), (\w+); (.+)")
tasks = []
# NOTE(review): text_data is defined outside this snippet; presumably a
# multi-line string with one task per line -- confirm.
for line in text_data.split("\n"):
    match = regex.match(line)
    if match:  # lines that do not fit the pattern are skipped
        # Creates a tuple from multiple groups
        task = (match.group(1), match.group(2), match.group(3))
        tasks.append(task)
print(tasks)
# output the following line
# [('101', 'Homework', 'Complete physics and math'),
#  ('102', 'Laundry', 'Wash all the clothes today'),
#  ('103', 'Museum', 'All about Egypt')]
To name a group, you use the syntax (?P<group_name>pattern), in which you name the pattern group as group_name. The name should be a valid Python identifier because you must be able to retrieve the group by that name, for example with match.group("group_name").
regex = re.compile(r"(?P<task_id>\d{3}), (?P<task_title>\w+);
tasks = []
for line in text_data.split("\n"):
match = regex.match(line)
if match:
task = ('task_id'),'task_title'),