refactor: improve variable naming and streamline placeholder extraction logic in PDF processing

This commit is contained in:
Catalin Pit
2025-11-04 14:18:01 +02:00
parent 13d9ca7a0e
commit a08a77e98b

View File

@ -166,12 +166,9 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold
Page dimensions from PDF2JSON are in "page units" (relative coordinates) Page dimensions from PDF2JSON are in "page units" (relative coordinates)
*/ */
const pageWidth = page.Width;
const pageHeight = page.Height;
let pageText = ''; let pageText = '';
const textPositions: TextPosition[] = []; const textPositions: TextPosition[] = [];
const charIndexToTextPos: CharIndexMapping[] = []; const charIndexMappings: CharIndexMapping[] = [];
page.Texts.forEach((text) => { page.Texts.forEach((text) => {
/* /*
@ -187,7 +184,7 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold
This allows us to quickly find the position of a character in the textPositions array by its index. This allows us to quickly find the position of a character in the textPositions array by its index.
*/ */
for (let i = 0; i < decodedText.length; i++) { for (let i = 0; i < decodedText.length; i++) {
charIndexToTextPos.push({ charIndexMappings.push({
textPositionIndex: textPositions.length, textPositionIndex: textPositions.length,
}); });
} }
@ -216,11 +213,13 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold
*/ */
for (const placeholderMatch of placeholderMatches) { for (const placeholderMatch of placeholderMatches) {
const placeholder = placeholderMatch[0]; const placeholder = placeholderMatch[0];
const placeholderData = placeholderMatch[1].split(',').map((part) => part.trim()); const placeholderData = placeholderMatch[1].split(',').map((property) => property.trim());
const [fieldTypeString, recipient, ...fieldMetaData] = placeholderData; const [fieldTypeString, recipient, ...fieldMetaData] = placeholderData;
const rawFieldMeta = Object.fromEntries(fieldMetaData.map((meta) => meta.split('='))); const rawFieldMeta = Object.fromEntries(
fieldMetaData.map((property) => property.split('=')),
);
const fieldType = parseFieldType(fieldTypeString); const fieldType = parseFieldType(fieldTypeString);
const parsedFieldMeta = parseFieldMeta(rawFieldMeta, fieldType); const parsedFieldMeta = parseFieldMeta(rawFieldMeta, fieldType);
@ -231,10 +230,10 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold
}); });
/* /*
Find the position of where the placeholder starts in the text Find the position of where the placeholder starts and ends in the text.
Then find the position of where the placeholder ends in the text Then find the position of the characters in the textPositions array.
by adding the length of the placeholder to the index of the placeholder. This allows us to quickly find the position of a character in the textPositions array by its index.
*/ */
if (placeholderMatch.index === undefined) { if (placeholderMatch.index === undefined) {
console.error('Placeholder match index is undefined for placeholder', placeholder); console.error('Placeholder match index is undefined for placeholder', placeholder);
@ -242,41 +241,42 @@ export const extractPlaceholdersFromPDF = async (pdf: Buffer): Promise<Placehold
continue; continue;
} }
const placeholderLength = placeholder.length; const placeholderEndCharIndex = placeholderMatch.index + placeholder.length;
const placeholderEndIndex = placeholderMatch.index + placeholderLength;
const startCharacterIndex = charIndexToTextPos[placeholderMatch.index]; /*
const endCharacterIndex = charIndexToTextPos[placeholderEndIndex - 1]; Get the index of the placeholder's first and last character in the textPositions array.
Used to retrieve the character information from the textPositions array.
if (!startCharacterIndex || !endCharacterIndex) { Example:
console.error('Could not find text position for placeholder', placeholder); startTextPosIndex - 1
endTextPosIndex - 40
return;
}
const startTextPos = textPositions[startCharacterIndex.textPositionIndex];
const endTextPos = textPositions[endCharacterIndex.textPositionIndex];
/*
PDF2JSON coordinates - these are in "page units" (relative coordinates)
Calculate width as the distance from start to end, plus a portion of the last character's width
Use 10% of the last character width to avoid extending too far beyond the placeholder
*/ */
const x = startTextPos.x; const startTextPosIndex = charIndexMappings[placeholderMatch.index].textPositionIndex;
const y = startTextPos.y; const endTextPosIndex = charIndexMappings[placeholderEndCharIndex - 1].textPositionIndex;
const width = endTextPos.x + endTextPos.w * 0.1 - startTextPos.x;
/*
Get the placeholder's first and last character information from the textPositions array.
Example:
placeholderStart = { text: '{', x: 100, y: 100, w: 100 }
placeholderEnd = { text: '}', x: 200, y: 100, w: 100 }
*/
const placeholderStart = textPositions[startTextPosIndex];
const placeholderEnd = textPositions[endTextPosIndex];
const width = placeholderEnd.x + placeholderEnd.w * 0.1 - placeholderStart.x;
placeholders.push({ placeholders.push({
placeholder, placeholder,
recipient, recipient,
fieldAndMeta, fieldAndMeta,
page: pageIndex + 1, page: pageIndex + 1,
x, x: placeholderStart.x,
y, y: placeholderStart.y,
width, width,
height: 1, height: 1,
pageWidth, pageWidth: page.Width,
pageHeight, pageHeight: page.Height,
}); });
} }
}); });
@ -311,12 +311,6 @@ export const replacePlaceholdersInPDF = async (pdf: Buffer): Promise<Buffer> =>
- Need to convert from page units to points - Need to convert from page units to points
- Y-axis in pdf-lib is bottom-up (origin at bottom-left) - Y-axis in pdf-lib is bottom-up (origin at bottom-left)
- Y-axis in PDF2JSON is top-down (origin at top-left) - Y-axis in PDF2JSON is top-down (origin at top-left)
Conversion formulas:
- x_points = (x / pageWidth) * pdfLibPageWidth
- y_points = pdfLibPageHeight - ((y / pageHeight) * pdfLibPageHeight)
- width_points = (width / pageWidth) * pdfLibPageWidth
- height_points = (height / pageHeight) * pdfLibPageHeight
*/ */
const xPoints = (placeholder.x / placeholder.pageWidth) * pdfLibPageWidth; const xPoints = (placeholder.x / placeholder.pageWidth) * pdfLibPageWidth;
@ -432,9 +426,9 @@ export const insertFieldsFromPlaceholdersInPDF = async (
}, },
}); });
const existingEmails = existingRecipients.map((r) => r.email); const existingEmails = new Set(existingRecipients.map((r) => r.email));
const recipientsToCreateFiltered = recipientsToCreate.filter( const recipientsToCreateFiltered = recipientsToCreate.filter(
(recipient) => !existingEmails.includes(recipient.email), (recipient) => !existingEmails.has(recipient.email),
); );
let createdRecipients: Pick<Recipient, 'id' | 'email'>[] = existingRecipients; let createdRecipients: Pick<Recipient, 'id' | 'email'>[] = existingRecipients;