Really weird large redaction black boxes

lstefan1520 · November 11, 2025, 6:41pm

Hi all,

I am trying to write redaction code for PDF documents. For the vast majority of the redactions I perform, the PDF document is redacted properly; however, some weird large boxes pop up:
image.png (631 Bytes)

This is my current function to redact:

def _rect_coords(r) -> Tuple[float, float, float, float]:
    llx = getattr(r, "llx", getattr(r, "lower_left_x", 0.0))
    lly = getattr(r, "lly", getattr(r, "lower_left_y", 0.0))
    urx = getattr(r, "urx", getattr(r, "upper_right_x", 0.0))
    ury = getattr(r, "ury", getattr(r, "upper_right_y", 0.0))
    return float(llx), float(lly), float(urx), float(ury)

def _mk_rect(llx: float, lly: float, urx: float, ury: float) -> pdf.Rectangle:
    """Build a ``pdf.Rectangle`` from two opposite corners, in any order.

    The inputs are converted to float and reordered so that the lower-left
    corner carries the smaller x/y and the upper-right corner the larger x/y.
    The five-argument constructor is tried first; if it raises ``TypeError``
    the four-argument form is used instead.
    """
    xa, ya, xb, yb = (float(value) for value in (llx, lly, urx, ury))
    corners = (min(xa, xb), min(ya, yb), max(xa, xb), max(ya, yb))
    try:
        # some builds need 5th arg
        return pdf.Rectangle(*corners, True)
    except TypeError:
        return pdf.Rectangle(*corners)

def _union_rect(a: pdf.Rectangle, b: pdf.Rectangle) -> pdf.Rectangle:
    """Return the bounding rectangle of *a* and *b*.

    The result takes the smaller of the two lower-left coordinates and the
    larger of the two upper-right coordinates on each axis.
    """
    a_left, a_bottom, a_right, a_top = _rect_coords(a)
    b_left, b_bottom, b_right, b_top = _rect_coords(b)
    left = min(a_left, b_left)
    bottom = min(a_bottom, b_bottom)
    right = max(a_right, b_right)
    top = max(a_top, b_top)
    return _mk_rect(left, bottom, right, top)

def _clip_height(r: pdf.Rectangle, frac: float = 0.60) -> pdf.Rectangle:
    """Replace the vertical extent of *r* with a band centred on its midline.

    The band is ``frac`` of the rectangle's height (the height is floored at
    0.001) but never thinner than 0.5 units; the horizontal extent is kept.
    Used to avoid full-line banners when fine glyph geometry is unavailable.
    """
    left, bottom, right, top = _rect_coords(r)
    height = max(0.001, top - bottom)
    half_band = max(0.5, height * frac) / 2.0
    centre = (bottom + top) / 2.0
    return _mk_rect(left, centre - half_band, right, centre + half_band)

def _inset(r: pdf.Rectangle, dx=0.2, dy=0.1) -> pdf.Rectangle:
    """Shrink *r* by ``dx`` on the left/right and ``dy`` on the top/bottom.

    The inset is skipped on any axis where the rectangle is not larger than
    twice the inset. Without that guard the shifted edges would cross and
    ``_mk_rect`` would swap them back, returning a rectangle *larger* than the
    input (e.g. a 0.5-wide glyph with ``dx=0.6`` became 0.7 wide).

    Args:
        r: Rectangle to shrink; its corners may be given in either order.
        dx: Horizontal inset applied to each side.
        dy: Vertical inset applied to each side.

    Returns:
        A new rectangle that is never larger than *r* for non-negative insets.
    """
    x1, y1, x2, y2 = _rect_coords(r)
    # Normalise first so the inset always moves edges toward the centre.
    left, right = min(x1, x2), max(x1, x2)
    bottom, top = min(y1, y2), max(y1, y2)
    if right - left <= 2 * dx:
        dx = 0.0  # no room: keep the full width rather than invert/collapse
    if top - bottom <= 2 * dy:
        dy = 0.0  # no room: keep the full height rather than invert/collapse
    return _mk_rect(left + dx, bottom + dy, right - dx, top - dy)

# ---------- tight rect for substring (skip whitespace glyphs) ----------

def _tight_rect_for_span(frag: "pdf.text.TextFragment", start: int, length: int) -> pdf.Rectangle:
    """
    Compute a tight rectangle for frag.text[start:start+length].

    Three strategies are tried in order; the first that yields a rectangle wins:
      1) Union of the *character* rectangles inside the span, ignoring
         characters whose text is whitespace; small inset applied.
      2) Union of the rectangles of every segment overlapping the span,
         then height clipping and an inset.
      3) A slice of the fragment rectangle proportional to the span's
         character offsets, then height clipping and an inset.
    """
    stop = start + length
    frag_text = frag.text or ""
    frag_rect = frag.rectangle
    f_left, f_bottom, f_right, f_top = _rect_coords(frag_rect)
    segments = list(getattr(frag, "segments", []))

    def _segments_in_span():
        """Yield (first_char_index, segment) for segments intersecting the span."""
        offset = 0
        for segment in segments:
            first = offset
            offset += len(segment.text or "")
            if offset > start and first < stop:
                yield first, segment

    # 1) CHAR-LEVEL (best): union only non-space glyphs
    glyph_box = None
    for first, segment in _segments_in_span():
        glyphs = list(getattr(segment, "characters", []))
        for pos, glyph in enumerate(glyphs, start=first):  # pos = char index in fragment
            if not (start <= pos < stop):
                continue
            glyph_text = getattr(glyph, "text", None)
            if glyph_text is not None and glyph_text.strip() == "":
                continue  # whitespace glyphs never contribute
            glyph_rect = getattr(glyph, "rectangle", None)
            if not glyph_rect:
                continue
            glyph_box = glyph_rect if glyph_box is None else _union_rect(glyph_box, glyph_rect)
    if glyph_box is not None:
        return _inset(glyph_box, dx=0.1, dy=0.05)

    # 2) SEGMENT-LEVEL
    segment_box = None
    for _, segment in _segments_in_span():
        # A segment without its own rectangle falls back to the fragment's.
        segment_rect = getattr(segment, "rectangle", None) or frag_rect
        segment_box = segment_rect if segment_box is None else _union_rect(segment_box, segment_rect)
    if segment_box is not None:
        # a bit tighter vertically, and slightly narrower
        return _inset(_clip_height(segment_box, frac=0.55), dx=0.6, dy=0.08)

    # 3) PROPORTIONAL SLICE
    char_count = max(1, len(frag_text))
    width = max(0.001, f_right - f_left)
    x_from = f_left + (start / char_count) * width
    x_to = f_left + (stop / char_count) * width
    sliced = _mk_rect(min(x_from, x_to), f_bottom, max(x_from, x_to), f_top)
    return _inset(_clip_height(sliced, frac=0.55), dx=0.6, dy=0.08)

# ---------- main entry (same signature) ----------

def redact_literals_on_pages(doc: "pdf.Document", patterns: List[str], page_numbers: List[int]) -> int:
    """Redact every match of *patterns* on the requested pages of *doc*.

    For each non-empty pattern and each page, a regex-enabled
    ``TextFragmentAbsorber`` collects text fragments. Every Python-regex match
    inside a fragment's text gets a black ``RedactionAnnotation`` sized by
    ``_tight_rect_for_span``; the annotation is added to the page and applied
    immediately.

    Zero-length and whitespace-only matches are skipped: they contain no
    non-space glyphs, so ``_tight_rect_for_span`` would fall back to the
    rectangle of the whole overlapping segment (or fragment) and paint an
    oversized black box over text that did not match.

    Args:
        doc: Document whose pages are modified in place.
        patterns: Regular-expression strings; empty strings are ignored.
        page_numbers: Indexes passed straight to ``doc.pages[...]``.

    Returns:
        The number of redaction annotations added.
    """
    if not patterns or not page_numbers:
        return 0

    compiled = [(pat, re.compile(pat)) for pat in patterns if pat]

    total = 0
    for pat_str, py_rx in compiled:
        for pnum in page_numbers:
            page = doc.pages[pnum]
            absorber = pdf.text.TextFragmentAbsorber(pat_str)
            absorber.text_search_options = pdf.text.TextSearchOptions(True)  # regex ON
            page.accept(absorber)

            for frag in absorber.text_fragments:
                ftxt = frag.text or ""
                for m in py_rx.finditer(ftxt):
                    if not m.group(0).strip():
                        # Empty or whitespace-only match: nothing visible to hide.
                        continue
                    rect = _tight_rect_for_span(frag, m.start(), m.end() - m.start())
                    ann = pdf.annotations.RedactionAnnotation(page, rect)
                    ann.fill_color = pdf.Color.black
                    page.annotations.add(ann, True)
                    # Best-effort apply: try the no-argument call first, then
                    # the one-argument form. If both signatures are rejected
                    # the annotation stays on the page unapplied.
                    try:
                        ann.redact()
                    except TypeError:
                        try:
                            ann.redact(True)
                        except TypeError:
                            pass
                    total += 1
    return total

I have tried multiple different ways to redact, and they all seem to produce the same large-box issue.

Has anyone had a similar issue? Thank you!

Professionalize.Discourse · November 11, 2025, 6:42pm

@lstefan1520

Summary
You are seeing very large black rectangles when you apply redaction annotations. The most common reasons are that the rectangle supplied to the RedactionAnnotation does not match the page’s coordinate system (e.g., it is too large, shifted, or not transformed for page rotation) or that the SDK version you are using has a bug in how it interprets those rectangles.

Why it happens

Possible cause	What it means for your code
Rectangle coordinates are outside the page bounds – The coordinates returned by `TextFragment.rectangle` (or by character‑level rectangles) are in user space (origin at the lower‑left of the page). If the page is rotated, has a non‑standard media box, or the values are not transformed, the annotation can cover the whole page or a very large area.
SDK version issue – Some older releases of Aspose PDF for Python have been reported to mis‑interpret redaction rectangles on certain pages. Upgrading often fixes the problem.
Whitespace‑only matches – When the matched text consists only of spaces/tabs, the calculated rectangle can become extremely wide, producing a large black box.
Calling `ann.redact()` per annotation – This only creates the visual appearance. The actual content removal is performed later (e.g., `doc.redact()` or during `doc.save`). If the appearance is generated with an incorrect rectangle, the oversized box will be shown.

What to do next

Check your Aspose PDF for Python version and upgrade if it is not the latest stable build.

# Print the installed SDK version so it can be compared with the latest release.
# NOTE(review): assumes the aspose.pdf module exposes __version__ — confirm.
import aspose.pdf as pdf
print(pdf.__version__) # e.g. 25.10.0

Upgrade with pip install -U aspose-pdf if needed.

Log the rectangle values for the fragments that produce the large boxes. Insert a debug print just before creating the annotation:

# Compute the redaction rectangle for the current match, then log it next to
# the page size and rotation so out-of-bounds coordinates are easy to spot.
rect = _tight_rect_for_span(frag, m.start(), m.end() - m.start())
print(f"Page {pnum}: rect = ({rect.llx}, {rect.lly}, {rect.urx}, {rect.ury})")
# NOTE(review): page_info.width/height and page.rotation are taken as given — confirm the attribute names against the SDK.
print(f"Page size: {page.page_info.width} x {page.page_info.height}, rotation={page.rotation}")

Verify that 0 ≤ llx, urx ≤ width and 0 ≤ lly, ury ≤ height. If any value is outside these limits, adjust the calculation.

Handle page rotation or media‑box offsets if your PDFs are rotated. A simple helper for a 90° clockwise rotation:

def transform_rect(rect, page):
    """Remap *rect* for a page rotated by 90 degrees; otherwise return it as is.

    Only the 90-degree case is handled. For any other rotation value the
    input rectangle is returned unchanged.

    NOTE(review): assumes ``page.rotation`` holds the rotation in degrees and
    that ``pdf.Rectangle`` accepts four positional coordinates — confirm both
    against the installed SDK build.
    """
    if page.rotation == 90:
        llx, lly, urx, ury = rect.llx, rect.lly, rect.urx, rect.ury
        width = page.page_info.width
        # swap X/Y and shift origin
        return pdf.Rectangle(lly, width - urx, ury, width - llx)
    # add other rotation cases as needed
    return rect

Use it before creating the annotation:

# Remap the rectangle for the page's rotation before building the annotation from it.
rect = transform_rect(rect, page)
ann = pdf.annotations.RedactionAnnotation(page, rect)

Add all redaction annotations first, then run a document‑wide redaction instead of calling ann.redact() for each annotation:

# Create the annotation, make it black and attach it to the page; applying
# the redaction is deliberately left for a later pass.
ann = pdf.annotations.RedactionAnnotation(page, rect)
ann.fill_color = pdf.Color.black
page.annotations.add(ann) # do NOT call ann.redact() here

After processing all pages:

# NOTE(review): confirm that the Document class in your SDK build exposes
# redact(); if it does not, apply each RedactionAnnotation individually.
doc.redact() # performs the actual redaction pass
# Write the processed document to disk.
doc.save("output.pdf")

Skip whitespace‑only matches to avoid creating huge rectangles for spaces:

if frag.text[m.start():m.end()].strip() == "":
continue # nothing to redact

If the problem persists, gather a minimal reproducible example and share it with Aspose support, including:

Exact Aspose PDF for Python version.
A short PDF (single‑page) that reproduces the large black rectangle.
The console output from step 2 showing the rectangle coordinates.

Next steps
If after applying the above checks the rectangles still appear oversized and the logged coordinates look correct, please provide the sample data to the product team for deeper investigation. The issue may be a regression in the SDK.

Note: The evidence available does not contain a direct reference to this specific redaction‑box problem, so the above guidance is based on common causes and best‑practice debugging steps for Aspose PDF redaction.

I could not fully verify this explanation against the retrieved sources. Please double-check the behaviour in your environment and confirm the exact SDK/version.

lstefan1520 · November 18, 2025, 4:55pm

I am running aspose-pdf==25.10.0

My current code has implemented all of the changes requested, and I am still seeing the same issue.

It is specific to the type of PDF I am trying to redact. I used Aspose to extract the text of the PDF to see whether the problem lies in how the PDF is read, and I believe it does…

On the left is an example of a PDF that is redacted properly; on the right is a PDF where something weird happens when I extract the PDF text:
image.png (25.2 KB)

When I attempt to redact the PDF within Adobe, it works perfectly fine:
image.png (3.9 KB)

The output console of the redaction is as follows:

--- Processing page 1 ---
[DEBUG] Original rect: (382.43, 6.59, 469.24, 70.29)
[DEBUG] Page dimensions: 595.00 x 842.00
[DEBUG] Page 1: Redacted ''(REMOVED FOR PPI)...'
[DEBUG] Original rect: (318.78, 224.27, 369.05, 287.97)
[DEBUG] Page dimensions: 595.00 x 842.00
[DEBUG] Page 1: Redacted '(REMOVED FOR PPI)...'
[DEBUG] Original rect: (497.80, 713.39, 560.28, 777.09)
[DEBUG] Page dimensions: 595.00 x 842.00
[DEBUG] Page 1: Redacted ''(REMOVED FOR PPI)...'
[DEBUG]

If you would like a copy of the PDF to test, please let me know.

Anastasia_Radtsevich · November 19, 2025, 5:59am

@lstefan1520 Yes, we need the input PDF file to test the issue. Could you please share it? Thank you.