- import subprocess
- from pathlib import Path
-
- from pikepdf import (
- Page,
- Operator,
- Pdf,
- Stream,
- parse_content_stream,
- unparse_content_stream,
- )
-
# Source PDF, resolved against the current working directory.
infile: Path = Path.cwd() / "rtl8196e-vex-cg-datasheet-1.1.pdf"
# Fill colour (the three operands of an "rg" operator) that identifies
# watermark text; all three components are the same value.
wmark_colours = (0.8,) * 3

# Intermediary QDF file: same location and stem as the input, ".qdf" suffix.
qdf_file: Path = infile.with_suffix(".qdf")
# Cleaned output: same location and suffix as the input, ".clean" added to the stem.
outfile: Path = infile.with_stem(f"{infile.stem}.clean")
-
- # function to strip watermark text from a page
def strip_watermark_text(pdf: Pdf, page: Page, watermark_colors: tuple) -> None:
    """Drop every text object on *page* that is drawn in the watermark colour.

    The page's content stream is parsed into instructions. Everything outside
    a ``BT`` ... ``ET`` text object is kept unchanged. Instructions inside a
    text object are buffered until the object ends and are then kept only if
    the fill colour recorded for that object differs from *watermark_colors*.
    The page's ``Contents`` is replaced with a new stream built from the kept
    instructions.

    Args:
        pdf: Document that will own the newly created content stream.
        page: Page whose content stream is rewritten in place.
        watermark_colors: Three floats compared (by exact tuple equality)
            against the operands of the last ``rg`` operator seen inside each
            text object.

    Note:
        Only ``rg`` operators that appear inside the text object are tracked;
        the tracked colour is reset to ``(0.0, 0.0, 0.0)`` at every ``BT``.
    """
    # Build the operator constants once rather than on every instruction.
    begin_text = Operator("BT")
    end_text = Operator("ET")
    set_fill_rgb = Operator("rg")
    default_fill = (0.0, 0.0, 0.0)

    kept = []  # instructions that go into the rewritten stream
    pending = []  # instructions of the text object currently being read
    in_text_obj = False
    fill_rgb = default_fill

    for operands, operator in parse_content_stream(page, ""):
        if not in_text_obj:
            if operator == begin_text:
                # Start buffering; whether to keep the object is only known at ET.
                in_text_obj = True
                fill_rgb = default_fill
                pending.append((operands, operator))
            else:
                kept.append((operands, operator))
            continue

        if operator == set_fill_rgb:
            fill_rgb = (float(operands[0]), float(operands[1]), float(operands[2]))
        pending.append((operands, operator))

        if operator == end_text:
            in_text_obj = False
            if fill_rgb != watermark_colors:
                kept.extend(pending)
            pending.clear()

    # A text object that never reached its ET would otherwise vanish silently;
    # apply the same colour rule to whatever was buffered.
    if pending and fill_rgb != watermark_colors:
        kept.extend(pending)

    page.Contents = Stream(pdf, unparse_content_stream(kept))
-
-
if __name__ == "__main__":
    # qpdf exits with 0 on success, 2 on errors and 3 when it only had
    # warnings; in the warning case the output file has still been written.
    QPDF_EXIT_WARNING = 3

    # Convert to QDF so the content streams are stored uncompressed.
    # qpdf --qdf --stream-data=uncompress infile outfile
    print("Converting PDF to QDF...")
    result = subprocess.call(["qpdf", "--qdf", "--stream-data=uncompress", infile, qdf_file])
    if result == QPDF_EXIT_WARNING:
        print("qpdf reported warnings; continuing with the QDF it produced")
    elif result != 0:
        raise RuntimeError("qpdf failed parsing the PDF file to QDF")

    try:
        # open the QDF as a pikepdf object; the context manager closes it
        # before the intermediary file is deleted below
        print("Opening QDF file as pikepdf object...")
        with Pdf.open(qdf_file) as document:
            print("Processing pages...")
            for page in document.pages:
                print(f"{page.index}", end=" ")
                strip_watermark_text(document, page, wmark_colours)

            print(f"Saving to {outfile} ...")
            document.save(outfile)
    finally:
        # Remove the intermediary file even when processing or saving failed.
        print(f"Removing {qdf_file} ...")
        qdf_file.unlink()

    print("Done! Hopefully...")