python manage files - ghdrako/doc_snipets GitHub Wiki

create dir

from pathlib import Path
# Create the working directory used by the rest of the examples.
data_folder = Path("data")
# exist_ok=True keeps the snippet re-runnable: without it, a second run
# raises FileExistsError because the directory is already there.
data_folder.mkdir(exist_ok=True)
assert data_folder.exists()

create files

# Write one small text file per (subject, extension) pair into data_folder.
subject_ids = [123, 124, 125]
extensions = ["config", "dat", "txt"]
filenames = [
  f"subject_{subject_id}.{extension}"
  for subject_id in subject_ids
  for extension in extensions
]
for filename in filenames:
  # The "/" operator joins path components with the right separator.
  filepath = data_folder / filename
  filepath.write_text(f"It's the file {filename}.")

When you create a file path using directory_path / filename, the operation is operating-system agnostic: pathlib inserts the correct separator, so the same code runs unchanged on Windows, macOS, and Linux.

This cross-platform compatibility is another advantage of using pathlib over the os module, where you may end up writing paths as raw strings whose separators are platform dependent.

Retrieving the list of files of a specific kind

data_folder = Path("data")

# glob() is lazy: it hands back a generator, so printing it shows the
# generator object rather than the matching paths.
data_files = data_folder.glob("*.dat")
print("Data files:", data_files)
for dat_path in data_files:
  print(f"Processing file: {dat_path}")
  # applicable data processing steps here

# The first generator has been consumed by the loop above, so glob again.
# sorted() turns the generator into a list ordered by path.
data_files = data_folder.glob("*.dat")
ordered_files = sorted(data_files)
for dat_path in ordered_files:
  print(f"Processing file: {dat_path}")
  # applicable data processing steps here

Moving files to a different folder

# Move every file whose name contains a subject id into that subject's folder.
subject_ids = [123, 124, 125]
data_folder = Path("data")
for subject_id in subject_ids:
  # One destination folder per subject; parents=True also creates "subjects".
  subject_folder = Path(f"subjects/subject_{subject_id}")
  subject_folder.mkdir(parents=True, exist_ok=True)
  matching_files = data_folder.glob(f"*{subject_id}*")
  for source_path in matching_files:
    filename = source_path.name
    target_path = subject_folder.joinpath(filename)
    # rename() performs the move; its return value is not needed here.
    source_path.rename(target_path)
    print(f"Moving {filename} to {target_path}")

Copying files to a different folder

import shutil
# Start from a clean slate: remove the "subjects" folder and its contents.
# The existence check avoids a FileNotFoundError when the folder has not
# been created yet (e.g. when this snippet is run on its own).
subjects_root = Path("subjects")
if subjects_root.exists():
  shutil.rmtree(subjects_root)  # Removes a folder and its contents
subject_ids = [123, 124, 125]
data_folder = Path("data")
for subject_id in subject_ids:
  # Recreate the per-subject destination folder.
  subject_folder = Path(f"subjects/subject_{subject_id}")
  subject_folder.mkdir(parents=True, exist_ok=True)
  for subject_file in data_folder.glob(f"*{subject_id}*"):
    filename = subject_file.name
    target_path = subject_folder / filename
    # copy() leaves the source file in place, unlike Path.rename().
    shutil.copy(subject_file, target_path)
    print(f"Copying {filename} to {target_path}")

In the copying example above, we use shutil.rmtree to remove a folder together with its contents, because rmtree doesn’t care whether the directory is empty. By contrast, Path.rmdir raises an error if the directory is not empty:

# Intentional failure: Path.rmdir() only removes empty directories, so this
# call raises while "subjects" still has content.
Path("subjects").rmdir()
# ERROR: OSError: [Errno 66] Directory not empty: 'subjects'
# NOTE: the errno number in the message may differ between operating systems.

Delete files

# Delete every .txt file in the data folder, reporting its existence
# before and after the call to unlink().
data_folder = Path("data")
for txt_path in data_folder.glob("*.txt"):
  existed_before = txt_path.exists()
  txt_path.unlink()
  exists_after = txt_path.exists()
  print(f"Deleting {txt_path}, existing? {existed_before} -> {exists_after}")

Retrieving filename information

from pathlib import Path
subjects_folder = Path("subjects")
# The "**/" prefix makes the pattern recursive, so .dat files that reside
# in subdirectories of subjects_folder are matched.
dat_paths = subjects_folder.glob("**/*.dat")
for dat_path in dat_paths:
  # parent is the containing directory; stem is the name without ".dat".
  subject_dir, filename = dat_path.parent, dat_path.stem
  # The matching .config file sits next to the .dat file.
  config_path = subject_dir.joinpath(f"{filename}.config")
  print(f"{subject_dir} & {filename} -> {config_path}")
  dat_exists, config_exists = dat_path.exists(), config_path.exists()
  with open(dat_path) as dat_file:
    with open(config_path) as config_file:
      print(f"Process {filename}: dat? {dat_exists}, config? {config_exists}\n")
      # process the subject's data

Retrieving the file's size and time information

def process_data_using_size_cutoff(min_size, max_size):
  """Report whether each .dat file in the "data" folder is within a size range.

  Prints one line per file, marking it "Good" when its size lies strictly
  between the two cutoffs and "Bad" otherwise. Returns nothing.

  Args:
    min_size: Lower size cutoff in bytes (exclusive).
    max_size: Upper size cutoff in bytes (exclusive).
  """
  data_folder = Path("data")
  for dat_path in data_folder.glob("*.dat"):
    filename = dat_path.name
    size = dat_path.stat().st_size  # file size in bytes
    # Both comparisons are strict, so a file whose size equals either
    # cutoff is reported as "Bad".
    if min_size < size < max_size:
      print(f"{filename}, Good; {size}, within [{min_size}, {max_size}]")
    else:
      print(f"{filename}, Bad; {size}, outside [{min_size}, {max_size}]")

To retrieve time-related metadata, we can call the stat method on the Path instance:

import time
# Read the file's last-modification timestamp and show it in readable form.
subject_dat_path = Path("data/subject_123.dat")
file_stats = subject_dat_path.stat()
modified_time = file_stats.st_mtime  # numeric timestamp (see sample output)
readable_time = time.ctime(modified_time)  # same instant as a text string
print(f"Modification time: {modified_time} -> {readable_time}")
# output: Modification time: 1652123144.9999998 -> Mon May 9 14:05:44 2022