python manage files - ghdrako/doc_snipets GitHub Wiki
create dir
from pathlib import Path

# Create the "data" folder relative to the current working directory.
data_folder = Path("data")
# exist_ok=True keeps the snippet re-runnable: a plain mkdir() raises
# FileExistsError when the folder is already there.
data_folder.mkdir(exist_ok=True)
# Sanity check for the tutorial: the path now exists and is a directory.
assert data_folder.is_dir()
create files
subject_ids = [123, 124, 125]
extensions = ["config", "dat", "txt"]

# One file name per (subject, extension) pair, in the same order as a
# nested subject -> extension loop.
filenames = [
    f"subject_{subject_id}.{extension}"
    for subject_id in subject_ids
    for extension in extensions
]

for filename in filenames:
    filepath = data_folder / filename  # Creates a file path
    # Each file just records its own name as content.
    with open(filepath, "w") as file:
        file.write(f"It's the file {filename}.")
When you create a file path using `directory_path / filename`, the operation is operating-system agnostic: the same code runs unchanged on different platforms (for example, Windows, macOS, and Linux).
This cross-platform compatibility is another advantage of using pathlib instead of the os module, where paths are plain strings (often raw strings) whose separators are platform dependent.
Retrieving the list of files of a specific kind
data_folder = Path("data")

# glob() is lazy: printing its result shows the generator object itself,
# not the matching paths.
data_files = data_folder.glob("*.dat")  # Creates a generator object
print("Data files:", data_files)
for data_file in data_files:
    print(f"Processing file: {data_file}")
    # applicable data processing steps here

# The first generator has been consumed by the loop above, so glob again;
# sorted() turns the new generator into an ordered list.
for data_file in sorted(data_folder.glob("*.dat")):  # sort generator
    print(f"Processing file: {data_file}")
    # applicable data processing steps here
Moving files to a different folder
subject_ids = [123, 124, 125]
data_folder = Path("data")

for subject_id in subject_ids:
    subject_folder = Path(f"subjects/subject_{subject_id}")
    subject_folder.mkdir(parents=True, exist_ok=True)  # Creates the subject folder

    # Match the exact "subject_<id>.<ext>" names. A bare f"*{subject_id}*"
    # is a substring match and would also grab files of other subjects
    # whose id merely contains these digits (e.g. 1234 for 123).
    for subject_file in data_folder.glob(f"subject_{subject_id}.*"):
        filename = subject_file.name
        target_path = subject_folder / filename  # Constructs the target path
        subject_file.rename(target_path)  # Moves the file into the subject folder
        print(f"Moving {filename} to {target_path}")
Copying files to a different folder
import shutil

# Start from a clean slate: remove the "subjects" folder and its contents.
# On a first run the folder does not exist yet and rmtree raises
# FileNotFoundError, so that one case is tolerated explicitly.
try:
    shutil.rmtree("subjects")  # Removes a folder and its contents
except FileNotFoundError:
    pass  # nothing to clean up

subject_ids = [123, 124, 125]
data_folder = Path("data")

for subject_id in subject_ids:
    subject_folder = Path(f"subjects/subject_{subject_id}")
    subject_folder.mkdir(parents=True, exist_ok=True)

    # Match the exact "subject_<id>.<ext>" names. A bare f"*{subject_id}*"
    # is a substring match and would also copy files of other subjects
    # whose id merely contains these digits (e.g. 1234 for 123).
    for subject_file in data_folder.glob(f"subject_{subject_id}.*"):
        filename = subject_file.name
        target_path = subject_folder / filename
        shutil.copy(subject_file, target_path)  # the source file stays in place
        print(f"Copying {filename} to {target_path}")
We use the `rmtree` function to remove a folder and its contents, because `rmtree` doesn't care whether the directory is empty. By contrast, we run into a problem if we use `Path.rmdir` to remove a directory that is not empty:
# Deliberate failure demo: rmdir() is called on "subjects" while the folder
# still has contents, which produces the error recorded below.
Path("subjects").rmdir()
# ERROR: OSError: [Errno 66] Directory not empty: 'subjects'
# NOTE(review): the errno number above is from the recorded run; it is
# presumably platform specific -- confirm before relying on it.
Delete files
data_folder = Path("data")

# Remove every .txt file in the folder, reporting whether the path existed
# immediately before and immediately after unlink().
for txt_path in data_folder.glob("*.txt"):
    existed_before = txt_path.exists()
    txt_path.unlink()
    exists_after = txt_path.exists()
    print(f"Deleting {txt_path}, existing? {existed_before} -> {exists_after}")
Retrieving filename information
from pathlib import Path

subjects_folder = Path("subjects")

# The "**/" prefix makes the search recursive, so .dat files inside the
# per-subject subdirectories are found.
for dat_path in subjects_folder.glob("**/*.dat"):
    subject_dir = dat_path.parent  # folder that holds the .dat file
    filename = dat_path.stem  # file name without the ".dat" suffix
    config_path = subject_dir / f"{filename}.config"  # sibling config file
    print(f"{subject_dir} & {filename} -> {config_path}")

    dat_exists = dat_path.exists()
    config_exists = config_path.exists()
    # Report before opening anything: printed from inside the with block,
    # the status would never appear for a missing file because open()
    # raises FileNotFoundError first.
    print(f"Process {filename}: dat? {dat_exists}, config? {config_exists}\n")
    if not (dat_exists and config_exists):
        continue  # incomplete pair: skip this subject instead of crashing

    with open(dat_path) as dat_file, open(config_path) as config_file:
        pass  # process the subject's data
Retrieving the file's size and time information
def process_data_using_size_cutoff(min_size, max_size, data_folder="data"):
    """Report whether each .dat file's size lies within [min_size, max_size].

    Args:
        min_size: Smallest acceptable file size in bytes (inclusive).
        max_size: Largest acceptable file size in bytes (inclusive).
        data_folder: Folder to scan for ``*.dat`` files. Defaults to
            ``"data"``, resolved against the current working directory.

    Prints one "Good"/"Bad" line per file; returns None.
    """
    # sorted() gives a stable report order regardless of how the
    # filesystem happens to list the directory.
    for dat_path in sorted(Path(data_folder).glob("*.dat")):
        filename = dat_path.name
        size = dat_path.stat().st_size
        # Closed interval, matching the "[min, max]" notation used in the
        # messages: a file exactly at either bound counts as within range.
        if min_size <= size <= max_size:
            print(f"{filename}, Good; {size}, within [{min_size}, {max_size}]")
        else:
            print(f"{filename}, Bad; {size}, outside [{min_size}, {max_size}]")
To retrieve time-related metadata, we can call the stat method on the Path instance:
import time

subject_dat_path = Path("data/subject_123.dat")

# stat() returns the file's metadata record; st_mtime is its modification
# timestamp as a number, which time.ctime() renders as readable text.
stat_info = subject_dat_path.stat()
modified_time = stat_info.st_mtime
readable_time = time.ctime(modified_time)
print(f"Modification time: {modified_time} -> {readable_time}")
# output: Modification time: 1652123144.9999998 -> Mon May 9 14:05:44 2022