import subprocess
from pathlib import Path

from pikepdf import (
    Page,
    Operator,
    Pdf,
    Stream,
    parse_content_stream,
    unparse_content_stream,
)

# input file
infile: Path = Path.cwd().joinpath("rtl8196e-vex-cg-datasheet-1.1.pdf")

# object matcher: RGB operands of the `rg` operator that mark a text object
# as watermark text
wmark_colours = (0.8, 0.8, 0.8)

# intermediary qdf file
qdf_file: Path = infile.with_suffix(".qdf")

# output file
outfile: Path = infile.with_stem(f"{infile.stem}.clean")


def strip_watermark_text(pdf: Pdf, page: Page, watermark_colors: tuple) -> None:
    """Strip watermark text objects from *page*, replacing its content stream.

    The page's content stream is parsed into instructions. Everything outside
    a ``BT`` ... ``ET`` text object is kept unchanged. Each text object is
    buffered, and is kept only if the colour tracked for it differs from
    *watermark_colors*.

    Colour tracking is a heuristic with these limits:

    * The tracked colour is reset to ``(0.0, 0.0, 0.0)`` at every ``BT``, so
      an ``rg`` issued before the text object starts is not seen.
    * Only the ``rg`` operator is inspected; the last ``rg`` inside the text
      object wins.
    * The comparison is exact tuple equality on floats, with no tolerance.

    Args:
        pdf: Document that owns *page*; the replacement stream is created in it.
        page: Page to rewrite. Its ``Contents`` entry is replaced.
        watermark_colors: ``(r, g, b)`` floats identifying watermark text.
    """
    # Build the operators once instead of on every instruction.
    begin_text = Operator("BT")
    end_text = Operator("ET")
    set_fill_rgb = Operator("rg")
    default_colour = (0.0, 0.0, 0.0)

    stream = []  # instructions that survive into the new content stream
    text_objects = []  # instructions of the text object currently being read
    in_text_obj = False
    color_values = default_colour

    for operands, operator in parse_content_stream(page, ""):
        if not in_text_obj:
            if operator == begin_text:
                in_text_obj = True
                color_values = default_colour
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
            continue

        if operator == set_fill_rgb:
            color_values = (
                float(operands[0]),
                float(operands[1]),
                float(operands[2]),
            )
        text_objects.append((operands, operator))

        if operator == end_text:
            in_text_obj = False
            if color_values != watermark_colors:
                stream.extend(text_objects)
            text_objects.clear()

    # A stream that ends inside a text object (BT without ET) would otherwise
    # lose that trailing text; apply the same colour rule to it.
    if text_objects and color_values != watermark_colors:
        stream.extend(text_objects)

    content_stream = unparse_content_stream(stream)
    page.Contents = Stream(pdf, content_stream)


def main() -> None:
    """Convert the input PDF to QDF, strip watermark text, save the result.

    Raises:
        RuntimeError: If the ``qpdf`` command exits with a non-zero status.
    """
    # Convert to QDF
    # qpdf --qdf --stream-data=uncompress infile outfile
    print("Converting PDF to QDF...")
    result = subprocess.call(
        ["qpdf", "--qdf", "--stream-data=uncompress", infile, qdf_file]
    )
    if result != 0:
        raise RuntimeError("qpdf failed parsing the PDF file to QDF")

    try:
        # open the QDF as a pikepdf object; the context manager closes it
        # before the intermediate file is deleted below
        print("Opening QDF file as pikepdf object...")
        with Pdf.open(qdf_file) as document:
            print("Processing pages...")
            for page in document.pages:
                print(f"{page.index}", end=" ")
                strip_watermark_text(document, page, wmark_colours)
            # finish the progress line so the next message starts on its own
            print()

            print(f"Saving to {outfile} ...")
            document.save(outfile)
    finally:
        # remove the intermediate file even if processing or saving failed
        print(f"Removing {qdf_file} ...")
        qdf_file.unlink(missing_ok=True)

    print("Done! Hopefully...")


if __name__ == "__main__":
    main()