Hi all,
I am trying to write redaction for PDF documents. For the vast majority of redactions I am completing, the PDF document redacts properly, however I am getting some weird large boxes that pop up:
image.png (631 Bytes)
This is my current function to redact:
def _rect_coords(r) -> Tuple[float, float, float, float]:
llx = getattr(r, "llx", getattr(r, "lower_left_x", 0.0))
lly = getattr(r, "lly", getattr(r, "lower_left_y", 0.0))
urx = getattr(r, "urx", getattr(r, "upper_right_x", 0.0))
ury = getattr(r, "ury", getattr(r, "upper_right_y", 0.0))
return float(llx), float(lly), float(urx), float(ury)
def _mk_rect(llx: float, lly: float, urx: float, ury: float) -> pdf.Rectangle:
x1, y1 = float(llx), float(lly)
x2, y2 = float(urx), float(ury)
llx_n, lly_n = min(x1, x2), min(y1, y2)
urx_n, ury_n = max(x1, x2), max(y1, y2)
try:
return pdf.Rectangle(llx_n, lly_n, urx_n, ury_n, True) # some builds need 5th arg
except TypeError:
return pdf.Rectangle(llx_n, lly_n, urx_n, ury_n)
def _union_rect(a: pdf.Rectangle, b: pdf.Rectangle) -> pdf.Rectangle:
ax1, ay1, ax2, ay2 = _rect_coords(a)
bx1, by1, bx2, by2 = _rect_coords(b)
return _mk_rect(min(ax1, bx1), min(ay1, by1), max(ax2, bx2), max(ay2, by2))
def _clip_height(r: pdf.Rectangle, frac: float = 0.60) -> pdf.Rectangle:
"""Reduce vertical height to avoid full-line banners when we lack fine geometry."""
x1, y1, x2, y2 = _rect_coords(r)
h = max(0.001, y2 - y1)
band = max(0.5, h * frac)
mid = (y1 + y2) / 2.0
return _mk_rect(x1, mid - band/2.0, x2, mid + band/2.0)
def _inset(r: pdf.Rectangle, dx=0.2, dy=0.1) -> pdf.Rectangle:
x1, y1, x2, y2 = _rect_coords(r)
return _mk_rect(x1 + dx, y1 + dy, x2 - dx, y2 - dy)
# ---------- tight rect for substring (skip whitespace glyphs) ----------
def _tight_rect_for_span(frag: "pdf.text.TextFragment", start: int, length: int) -> pdf.Rectangle:
"""
Tight rect for [start, start+length) inside frag.text:
1) Union of *character* rectangles within span, skipping whitespace chars.
2) Else union of overlapping segment rectangles, then clip height + small inset.
3) Else proportional slice inside fragment, then clip height + inset.
"""
end = start + length
ftxt = frag.text or ""
fr = frag.rectangle
fx1, fy1, fx2, fy2 = _rect_coords(fr)
# 1) CHAR-LEVEL (best): union only non-space glyphs
segs = list(getattr(frag, "segments", []))
if segs:
rect_chars = None
idx = 0
for seg in segs:
s = seg.text or ""
seg_start, seg_end = idx, idx + len(s)
idx = seg_end
if seg_end <= start or seg_start >= end:
continue
chars = list(getattr(seg, "characters", []))
if chars:
for i, ch in enumerate(chars):
ci = seg_start + i # char index in fragment
if start <= ci < end:
# Skip whitespace glyphs entirely
ch_text = getattr(ch, "text", None)
if ch_text is not None and ch_text.strip() == "":
continue
cr = getattr(ch, "rectangle", None)
if cr:
rect_chars = cr if rect_chars is None else _union_rect(rect_chars, cr)
if rect_chars is not None:
return _inset(rect_chars, dx=0.1, dy=0.05)
# 2) SEGMENT-LEVEL
rect_segs = None
if segs:
idx = 0
for seg in segs:
s = seg.text or ""
seg_start, seg_end = idx, idx + len(s)
idx = seg_end
if seg_end <= start or seg_start >= end:
continue
sr = getattr(seg, "rectangle", None) or fr
rect_segs = sr if rect_segs is None else _union_rect(rect_segs, sr)
if rect_segs is not None:
rect_segs = _clip_height(rect_segs, frac=0.55) # a bit tighter vertically
return _inset(rect_segs, dx=0.6, dy=0.08) # and slightly narrower
# 3) PROPORTIONAL SLICE
L = max(1, len(ftxt))
w = max(0.001, fx2 - fx1)
x_start = fx1 + (start / L) * w
x_end = fx1 + (end / L) * w
tight = _mk_rect(min(x_start, x_end), fy1, max(x_start, x_end), fy2)
tight = _clip_height(tight, frac=0.55)
return _inset(tight, dx=0.6, dy=0.08)
# ---------- main entry (same signature) ----------
def redact_literals_on_pages(doc: "pdf.Document", patterns: List[str], page_numbers: List[int]) -> int:
total = 0
if not patterns or not page_numbers:
return 0
compiled = [(pat, re.compile(pat)) for pat in patterns if pat]
for pat_str, py_rx in compiled:
for pnum in page_numbers:
page = doc.pages[pnum]
absorber = pdf.text.TextFragmentAbsorber(pat_str)
absorber.text_search_options = pdf.text.TextSearchOptions(True) # regex ON
page.accept(absorber)
for frag in absorber.text_fragments:
ftxt = frag.text or ""
for m in py_rx.finditer(ftxt):
rect = _tight_rect_for_span(frag, m.start(), m.end() - m.start())
ann = pdf.annotations.RedactionAnnotation(page, rect)
ann.fill_color = pdf.Color.black
page.annotations.add(ann, True)
try:
ann.redact()
except TypeError:
try:
ann.redact(True)
except TypeError:
pass
total += 1
return total
I have tried multiple different ways to redact, and they all seem to pop up the same large box issues.
Has anyone had a similar issue? Thank you!