1. import subprocess
  2. from pathlib import Path
  3. from pikepdf import (
  4. Page,
  5. Operator,
  6. Pdf,
  7. Stream,
  8. parse_content_stream,
  9. unparse_content_stream,
  10. )
  11. # input file
  12. infile: Path = Path.cwd().joinpath("rtl8196e-vex-cg-datasheet-1.1.pdf")
  13. # object matcher
  14. wmark_colours = (0.8, 0.8, 0.8)
  15. # intermediary qdf file
  16. qdf_file: Path = infile.with_suffix(".qdf")
  17. # output file
  18. outfile: Path = infile.with_stem(f"{infile.stem}.clean")
  19. # function to strip watermark text from a page
  20. def strip_watermark_text(pdf: Pdf, page: Page, watermark_colors: tuple):
  21. stream = []
  22. in_text_obj = False
  23. color_values = (0.0, 0.0, 0.0)
  24. text_objects = []
  25. for operands, operator in parse_content_stream(page, ""):
  26. if not in_text_obj:
  27. if operator == Operator("BT"):
  28. in_text_obj = True
  29. color_values = (0.0, 0.0, 0.0)
  30. text_objects.append((operands, operator))
  31. else:
  32. stream.append((operands, operator))
  33. else:
  34. if operator == Operator("rg"):
  35. color_values = (float(operands[0]), float(operands[1]), float(operands[2]))
  36. text_objects.append((operands, operator))
  37. if operator == Operator("ET"):
  38. in_text_obj = False
  39. if color_values != watermark_colors:
  40. stream.extend(text_objects)
  41. text_objects.clear()
  42. content_stream = unparse_content_stream(stream)
  43. page.Contents = Stream(pdf, content_stream)
  44. if __name__ == "__main__":
  45. # Convert to QDF
  46. # qpdf --qdf --stream-data=uncompress infile outfile
  47. print("Converting PDF to QDF...")
  48. result = subprocess.call(["qpdf", "--qdf", "--stream-data=uncompress", infile, qdf_file])
  49. if result != 0:
  50. raise RuntimeError("qpdf failed parsing the PDF file to QDF")
  51. # open the QDF as a pikepdf object
  52. print("Opening QDF file as pikepdf object...")
  53. document: Pdf = Pdf.open(qdf_file)
  54. print("Processing pages...")
  55. for page in document.pages:
  56. print(f"{page.index}", end=" ")
  57. strip_watermark_text(document, page, wmark_colours)
  58. print(f"Saving to {outfile} ...")
  59. document.save(outfile)
  60. print(f"Removing {qdf_file} ...")
  61. qdf_file.unlink()
  62. print("Done! Hopefully...")