refactor: improve variable naming and streamline placeholder extraction logic in PDF processing

chore: some cleanup
2025-11-13 08:13:56 +10:00 · 2025-11-04 14:18:01 +02:00 · 2025-11-04 11:16:53 +02:00
1 changed file with 67 additions and 61 deletions
--- a/packages/lib/server-only/pdf/auto-place-fields.ts
+++ b/packages/lib/server-only/pdf/auto-place-fields.ts
@ -25,7 +25,7 @@ type TextPosition = {
 };

 type CharIndexMapping = {
-  textPosIndex: number;
+  textPositionIndex: number;
 };

 type PlaceholderInfo = {
@ -121,28 +121,26 @@ const parseFieldMeta = (
    rawFieldMeta is an object with string keys and string values.
    It contains string values because the PDF parser returns the values as strings.

-    E.g. { required: 'true', fontSize: '12', maxValue: '100', minValue: '0', characterLimit: '100' }
+    E.g. { 'required': 'true', 'fontSize': '12', 'maxValue': '100', 'minValue': '0', 'characterLimit': '100' }
  */
  const rawFieldMetaEntries = Object.entries(rawFieldMeta);

-  for (const entry of rawFieldMetaEntries) {
-    const [key, value] = entry;
-
-    if (key === 'readOnly' || key === 'required') {
-      parsedFieldMeta[key] = value === 'true';
+  for (const [property, value] of rawFieldMetaEntries) {
+    if (property === 'readOnly' || property === 'required') {
+      parsedFieldMeta[property] = value === 'true';
    } else if (
-      key === 'fontSize' ||
-      key === 'maxValue' ||
-      key === 'minValue' ||
-      key === 'characterLimit'
+      property === 'fontSize' ||
+      property === 'maxValue' ||
+      property === 'minValue' ||
+      property === 'characterLimit'
    ) {
      const numValue = Number(value);

      if (!Number.isNaN(numValue)) {
-        parsedFieldMeta[key] = numValue;
+        parsedFieldMeta[property] = numValue;
      }
    } else {
-      parsedFieldMeta[key] = value;
+      parsedFieldMeta[property] = value;
    }
  }

@ -168,25 +166,26 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold
          
          Page dimensions from PDF2JSON are in "page units" (relative coordinates)
        */
-        const pageWidth = page.Width;
-        const pageHeight = page.Height;
-
        let pageText = '';
        const textPositions: TextPosition[] = [];
-        const charIndexToTextPos: CharIndexMapping[] = [];
+        const charIndexMappings: CharIndexMapping[] = [];

        page.Texts.forEach((text) => {
          /*
-            R is an array that contains objects with each character.
-            The decodedText contains only the character, without any other information.
+            R is an array of objects containing each character, its position and styling information.
+            The decodedText stores the characters, without any other information.

            textPositions stores each character and its position on the page.
          */
          const decodedText = text.R.map((run) => decodeURIComponent(run.T)).join('');

+          /*
+            For each character in the decodedText, we store the index of its entry in the textPositions array.
+            This allows us to map a character index in pageText to its position information in textPositions.
+          */
          for (let i = 0; i < decodedText.length; i++) {
-            charIndexToTextPos.push({
-              textPosIndex: textPositions.length,
+            charIndexMappings.push({
+              textPositionIndex: textPositions.length,
            });
          }

@ -202,13 +201,25 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold

        const placeholderMatches = pageText.matchAll(/{{([^}]+)}}/g);

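+        /*
+          A placeholder match is a RegExpMatchArray with the following shape:
+
+          [
+            '{{fieldType,recipient,fieldMeta}}', // [0] the full match
+            'fieldType,recipient,fieldMeta', // [1] the captured group
+          ]
+
+          It also has the properties `index` (<number>) and `input` (<pdf-text>).
+        */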
        for (const placeholderMatch of placeholderMatches) {
          const placeholder = placeholderMatch[0];
-          const placeholderData = placeholderMatch[1].split(',').map((part) => part.trim());
+          const placeholderData = placeholderMatch[1].split(',').map((property) => property.trim());

          const [fieldTypeString, recipient, ...fieldMetaData] = placeholderData;

-          const rawFieldMeta = Object.fromEntries(fieldMetaData.map((meta) => meta.split('=')));
+          const rawFieldMeta = Object.fromEntries(
+            fieldMetaData.map((property) => property.split('=')),
+          );

          const fieldType = parseFieldType(fieldTypeString);
          const parsedFieldMeta = parseFieldMeta(rawFieldMeta, fieldType);
@ -219,51 +230,53 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold
          });

          /*
-            Find the position of where the placeholder starts in the text
+            Find the position of where the placeholder starts and ends in the text.

-            Then find the position of where the placeholder ends in the text by adding the length of the placeholder to the index of the placeholder.
+            Then map the start and end character indexes to their entries in the textPositions array.
+            Those entries hold the on-page coordinates used to compute the placeholder's position and width.
          */
-          const matchIndex = placeholderMatch.index;
+          if (placeholderMatch.index === undefined) {
+            console.error('Placeholder match index is undefined for placeholder', placeholder);

-          if (matchIndex === undefined) {
            continue;
          }

-          const placeholderLength = placeholder.length;
-          const placeholderEndIndex = matchIndex + placeholderLength;
-
-          const startCharInfo = charIndexToTextPos[matchIndex];
-          const endCharInfo = charIndexToTextPos[placeholderEndIndex - 1];
-
-          if (!startCharInfo || !endCharInfo) {
-            console.error('Could not find text position for placeholder', placeholder);
-
-            return;
-          }
-
-          const startTextPos = textPositions[startCharInfo.textPosIndex];
-          const endTextPos = textPositions[endCharInfo.textPosIndex];
+          const placeholderEndCharIndex = placeholderMatch.index + placeholder.length;

          /*
-            PDF2JSON coordinates - these are in "page units" (relative coordinates)
-            Calculate width as the distance from start to end, plus a portion of the last character's width
-            Use 10% of the last character width to avoid extending too far beyond the placeholder
+            Get the index of the placeholder's first and last character in the textPositions array.
+            Used to retrieve the character information from the textPositions array.
+
+            Example:
+              startTextPosIndex = 1
+              endTextPosIndex = 40
          */
-          const x = startTextPos.x;
-          const y = startTextPos.y;
-          const width = endTextPos.x + endTextPos.w * 0.1 - startTextPos.x;
+          const startTextPosIndex = charIndexMappings[placeholderMatch.index].textPositionIndex;
+          const endTextPosIndex = charIndexMappings[placeholderEndCharIndex - 1].textPositionIndex;
+
+          /*
+            Get the placeholder's first and last character information from the textPositions array.
+
+            Example:
+              placeholderStart = { text: '{', x: 100, y: 100, w: 100 }
+              placeholderEnd = { text: '}', x: 200, y: 100, w: 100 }
+          */
+          const placeholderStart = textPositions[startTextPosIndex];
+          const placeholderEnd = textPositions[endTextPosIndex];
+
+          const width = placeholderEnd.x + placeholderEnd.w * 0.1 - placeholderStart.x;

          placeholders.push({
            placeholder,
            recipient,
            fieldAndMeta,
            page: pageIndex + 1,
-            x,
-            y,
+            x: placeholderStart.x,
+            y: placeholderStart.y,
            width,
            height: 1,
-            pageWidth,
-            pageHeight,
+            pageWidth: page.Width,
+            pageHeight: page.Height,
          });
        }
      });
@ -298,12 +311,6 @@ export const replacePlaceholdersInPDF = async (pdf: Buffer): Promise<Buffer> =>
      - Need to convert from page units to points
      - Y-axis in pdf-lib is bottom-up (origin at bottom-left)
      - Y-axis in PDF2JSON is top-down (origin at top-left)
-      
-      Conversion formulas:
-      - x_points = (x / pageWidth) * pdfLibPageWidth
-      - y_points = pdfLibPageHeight - ((y / pageHeight) * pdfLibPageHeight)
-      - width_points = (width / pageWidth) * pdfLibPageWidth
-      - height_points = (height / pageHeight) * pdfLibPageHeight
    */

    const xPoints = (placeholder.x / placeholder.pageWidth) * pdfLibPageWidth;
@ -419,9 +426,9 @@ export const insertFieldsFromPlaceholdersInPDF = async (
    },
  });

-  const existingEmails = new Set(existingRecipients.map((r) => r.email.toLowerCase()));
+  const existingEmails = new Set(existingRecipients.map((r) => r.email));
  const recipientsToCreateFiltered = recipientsToCreate.filter(
-    (r) => !existingEmails.has(r.email.toLowerCase()),
+    (recipient) => !existingEmails.has(recipient.email),
  );

  let createdRecipients: Pick<Recipient, 'id' | 'email'>[] = existingRecipients;
@ -473,8 +480,7 @@ export const insertFieldsFromPlaceholdersInPDF = async (
    const heightPercent = (placeholder.height / placeholder.pageHeight) * 100;

    const { email } = extractRecipientPlaceholder(placeholder.recipient);
-    const normalizedEmail = email.toLowerCase();
-    const recipient = createdRecipients.find((r) => r.email.toLowerCase() === normalizedEmail);
+    const recipient = createdRecipients.find((r) => r.email === email);

    if (!recipient) {
      throw new AppError(AppErrorCode.INVALID_BODY, {
Author	SHA1	Message	Date
Catalin Pit	a08a77e98b	refactor: improve variable naming and streamline placeholder extraction logic in PDF processing	2025-11-04 14:18:01 +02:00
Catalin Pit	13d9ca7a0e	chore: some cleanup	2025-11-04 11:16:53 +02:00