feat: Ultimate Edition - Modular assembly, duplicate scanner, content modules, quality flags
This commit is contained in:
224
frontend/src/pages/api/seo/assemble-article.ts
Normal file
224
frontend/src/pages/api/seo/assemble-article.ts
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
// @ts-ignore - Astro types available at build time
|
||||||
|
import type { APIRoute } from 'astro';
|
||||||
|
import { getDirectusClient, readItems, createItem, updateItem } from '@/lib/directus/client';
|
||||||
|
import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
|
||||||
|
|
||||||
|
/**
 * Assemble Article API
 *
 * POST /api/seo/assemble-article
 *
 * Builds a single article for a campaign/location pair by walking the
 * campaign's content recipe and, for each module type, taking the active
 * module with the lowest usage_count (ties broken by id) so modules rotate.
 *
 * Request body (JSON):
 *   - campaign_id   (required) id of a `campaign_masters` item
 *   - location      (required) object with optional `city`, `state`, `county`
 *   - publish_date  (optional) any value `new Date()` can parse; defaults to now
 *   - modified_date (optional) same as above; defaults to now
 *
 * Responses:
 *   - 200 { success, article_id, headline, word_count, modules_used, dates }
 *   - 400 malformed JSON, missing campaign_id/location, or unparseable dates
 *   - 404 campaign not found
 *   - 500 any other failure (details are logged, not returned)
 */
export const POST: APIRoute = async ({ request }: { request: Request }) => {
    // Single place to build JSON responses so every branch sets the same headers.
    const json = (body: unknown, status: number): Response =>
        new Response(JSON.stringify(body), {
            status,
            headers: { 'Content-Type': 'application/json' }
        });

    try {
        // A malformed body is a client error, not a server failure.
        let data: any;
        try {
            data = await request.json();
        } catch {
            return json({ error: 'Request body must be valid JSON' }, 400);
        }

        const {
            campaign_id,
            location, // { city, state, county }
            publish_date,
            modified_date
        } = data || {};

        if (!campaign_id || !location) {
            return json({ error: 'campaign_id and location required' }, 400);
        }

        // Validate dates up front: toISOString() throws a RangeError on an
        // invalid Date, which previously surfaced as an opaque 500.
        const pubDate = publish_date ? new Date(publish_date) : new Date();
        const modDate = modified_date ? new Date(modified_date) : new Date();
        if (Number.isNaN(pubDate.getTime()) || Number.isNaN(modDate.getTime())) {
            return json({ error: 'publish_date and modified_date must be valid dates' }, 400);
        }

        const directus = getDirectusClient();

        // Get campaign with recipe
        const campaigns = await directus.request(readItems('campaign_masters', {
            filter: { id: { _eq: campaign_id } },
            limit: 1
        })) as any[];

        const campaign = campaigns[0];
        if (!campaign) {
            return json({ error: 'Campaign not found' }, 404);
        }

        // Only iterate the stored recipe when it really is a list; iterating a
        // string would walk it character by character.
        const recipe: string[] = Array.isArray(campaign.content_recipe)
            ? campaign.content_recipe
            : ['intro', 'benefits', 'howto', 'conclusion'];

        // Build context for token replacement
        const context = {
            city: location.city || '',
            state: location.state || '',
            county: location.county || '',
            state_code: getStateCode(location.state) || '',
            year: pubDate.getFullYear()
        };

        // Fetch and assemble modules
        const assembledParts: string[] = [];
        const modulesUsed: string[] = [];

        for (const moduleType of recipe) {
            // Get the least-used active module of this type for the campaign's site.
            const modules = await directus.request(readItems('content_modules', {
                filter: {
                    site: { _eq: campaign.site },
                    module_type: { _eq: moduleType },
                    is_active: { _eq: true }
                },
                sort: ['usage_count', 'id'], // Lowest usage first
                limit: 1
            })) as any[];

            if (modules.length === 0) {
                // No module of this type: the section is simply omitted.
                continue;
            }

            const contentModule = modules[0];
            let content: string = contentModule.content_spintax || '';

            // Replace location tokens. Function replacers are used so that a
            // "$" inside a place name is inserted literally instead of being
            // read as a replacement pattern ($&, $1, ...).
            content = content
                .replace(/\{City\}/gi, () => context.city)
                .replace(/\{State\}/gi, () => context.state)
                .replace(/\{County\}/gi, () => context.county)
                .replace(/\{State_Code\}/gi, () => context.state_code)
                .replace(/\{Location_City\}/gi, () => context.city)
                .replace(/\{Location_State\}/gi, () => context.state);

            // Year tokens must be resolved before spintax, which would
            // otherwise consume any remaining {...} group.
            content = replaceYearTokens(content, pubDate);
            content = processSpintax(content);

            assembledParts.push(content);
            modulesUsed.push(contentModule.id);

            // Increment usage count.
            // NOTE(review): read-modify-write; concurrent requests can lose an
            // increment. Acceptable for rotation, not for exact accounting.
            await directus.request(
                updateItem('content_modules', contentModule.id, {
                    usage_count: (contentModule.usage_count || 0) + 1
                })
            );
        }

        const fullContent = assembledParts.join('\n\n');

        // Headline comes from the campaign title template, falling back to
        // "<city> <campaign name>" when the template is missing or empty.
        const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
            `${context.city} ${campaign.name || 'Guide'}`;

        // Generate meta from the tag-free text.
        const plainText = stripHtml(fullContent);
        const metaTitle = headline.substring(0, 60);
        // Only add the ellipsis when text was actually cut off.
        const metaDescription = plainText.length > 155
            ? plainText.substring(0, 155) + '...'
            : plainText;

        // ''.split(/\s+/) yields [''], so empty content must be special-cased
        // to report 0 words instead of 1.
        const wordCount = plainText ? plainText.split(/\s+/).length : 0;

        // Create article (unpublished; sitemap_status starts as 'ghost').
        const article = await directus.request(
            createItem('generated_articles', {
                site: campaign.site,
                campaign: campaign_id,
                headline: headline,
                meta_title: metaTitle,
                meta_description: metaDescription,
                full_html_body: fullContent,
                word_count: wordCount,
                is_published: false,
                is_test_batch: false,
                date_published: pubDate.toISOString(),
                date_modified: modDate.toISOString(),
                sitemap_status: 'ghost',
                location_city: context.city,
                location_county: context.county,
                location_state: context.state,
                modules_used: modulesUsed
            })
        ) as any;

        return json({
            success: true,
            article_id: article.id,
            headline,
            word_count: wordCount,
            modules_used: modulesUsed.length,
            dates: {
                published: pubDate.toISOString(),
                modified: modDate.toISOString()
            }
        }, 200);
    } catch (error) {
        console.error('Error assembling article:', error);
        return json({ error: 'Failed to assemble article' }, 500);
    }
};
|
||||||
|
|
||||||
|
/**
 * Resolve spintax groups of the form {option1|option2|option3}.
 *
 * Groups are resolved innermost-first, so nested groups such as
 * {a|{b|c}} work: each pass replaces every brace pair that contains no other
 * braces with one of its pipe-separated options, chosen uniformly at random.
 *
 * @param text Template text that may contain spintax groups.
 * @returns The text with every resolvable group replaced. Unbalanced braces
 *          and empty groups ("{}") are left untouched.
 */
function processSpintax(text: string): string {
    // Upper bound on nesting depth handled; also a hard stop for safety.
    const maxPasses = 100;
    let result = text;

    for (let pass = 0; pass < maxPasses && result.includes('{'); pass++) {
        const next = result.replace(/\{([^{}]+)\}/g, (_match: string, options: string) => {
            const choices = options.split('|');
            return choices[Math.floor(Math.random() * choices.length)];
        });

        // Every replacement removes a brace pair, so an unchanged string means
        // nothing matched (e.g. a stray "{"). Stop instead of re-scanning the
        // same text for the remaining passes.
        if (next === result) break;
        result = next;
    }

    return result;
}
|
||||||
|
|
||||||
|
/**
 * Generate a headline from a spintax title template.
 *
 * Substitutes the same location tokens the article body supports
 * ({City}, {State}, {County}, {State_Code}, {Location_City},
 * {Location_State}; case-insensitive), then year tokens, then resolves
 * spintax. Tokens must be substituted before spintax runs: an unsubstituted
 * "{State_Code}" would be treated as a one-option spintax group and emit the
 * literal text "State_Code".
 *
 * @param template Title template, or null/empty when the campaign has none.
 * @param context  Object with `city`, `state`, `county` and `state_code`
 *                 strings; missing values are substituted as ''.
 * @param date     Date passed to replaceYearTokens.
 * @returns The resolved headline, or '' when no template was given.
 */
function generateHeadline(template: string | null, context: any, date: Date): string {
    if (!template) return '';

    const city: string = context.city || '';
    const state: string = context.state || '';
    const county: string = context.county || '';
    const stateCode: string = context.state_code || '';

    // Function replacers keep "$" in a value from being read as a
    // replacement pattern.
    let headline = template
        .replace(/\{City\}/gi, () => city)
        .replace(/\{State\}/gi, () => state)
        .replace(/\{County\}/gi, () => county)
        .replace(/\{State_Code\}/gi, () => stateCode)
        .replace(/\{Location_City\}/gi, () => city)
        .replace(/\{Location_State\}/gi, () => state);

    headline = replaceYearTokens(headline, date);
    headline = processSpintax(headline);

    return headline;
}
|
||||||
|
|
||||||
|
/**
 * Reduce an HTML fragment to plain text: every tag becomes a space, runs of
 * whitespace collapse to a single space, and the ends are trimmed.
 * HTML entities are left as-is.
 */
function stripHtml(html: string): string {
    const withoutTags = html.replace(/<[^>]*>/g, ' ');
    const singleSpaced = withoutTags.replace(/\s+/g, ' ');
    return singleSpaced.trim();
}
|
||||||
|
|
||||||
|
/** US state names (lower-cased) mapped to their two-letter postal codes. */
const US_STATE_CODES: Readonly<Record<string, string>> = {
    'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
    'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
    'district of columbia': 'DC',
    'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
    'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
    'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
    'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
    'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
    'new hampshire': 'NH', 'new jersey': 'NJ', 'new mexico': 'NM', 'new york': 'NY',
    'north carolina': 'NC', 'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
    'oregon': 'OR', 'pennsylvania': 'PA', 'rhode island': 'RI', 'south carolina': 'SC',
    'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
    'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west virginia': 'WV',
    'wisconsin': 'WI', 'wyoming': 'WY'
};

/**
 * Look up the two-letter postal code for a US state.
 *
 * Matching ignores case and surrounding/repeated whitespace. A value that is
 * already a valid two-letter code is returned upper-cased.
 *
 * @param state Full state name (e.g. "New York") or a two-letter code.
 * @returns The postal code, or '' when the input is missing or not recognised.
 */
function getStateCode(state: string): string {
    // Callers pass values straight from request JSON, so guard non-strings.
    if (typeof state !== 'string') return '';

    const key = state.trim().replace(/\s+/g, ' ').toLowerCase();
    if (!key) return '';

    // Own-property check: a plain index lookup would also find inherited
    // members such as "constructor".
    if (Object.prototype.hasOwnProperty.call(US_STATE_CODES, key)) {
        return US_STATE_CODES[key];
    }

    // Accept an already-abbreviated state ("fl" -> "FL").
    const asCode = key.toUpperCase();
    if (asCode.length === 2 && Object.values(US_STATE_CODES).includes(asCode)) {
        return asCode;
    }

    return '';
}
|
||||||
290
frontend/src/pages/api/seo/process-queue.ts
Normal file
290
frontend/src/pages/api/seo/process-queue.ts
Normal file
@@ -0,0 +1,290 @@
|
|||||||
|
// @ts-ignore - Astro types available at build time
|
||||||
|
import type { APIRoute } from 'astro';
|
||||||
|
import { getDirectusClient, readItem, readItems, updateItem, createItem } from '@/lib/directus/client';
|
||||||
|
import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
|
||||||
|
|
||||||
|
/**
 * Process Queue API
 *
 * POST /api/seo/process-queue
 *
 * Runs the factory: generates the next batch of scheduled articles for an
 * approved (or already running) production queue. Intended to be called
 * repeatedly (cron or manually) until the response reports status "done".
 *
 * Request body (JSON):
 *   - queue_id    (required) id of a `production_queue` item
 *   - batch_limit (optional) positive integer, max articles per call; default 100
 *
 * Progress model: `completed_count` on the queue is the index of the next
 * schedule entry to generate. To keep that cursor truthful the batch stops at
 * the first article that fails; the failure is written to `error_log` and the
 * same entry is retried on the next call. (Continuing past a failure while
 * only counting successes made later entries get generated twice.)
 *
 * Responses:
 *   - 200 batch result, or "Queue complete" when nothing is left
 *   - 400 malformed JSON, missing queue_id, bad batch_limit, or queue not approved
 *   - 404 queue not found
 *   - 422 the campaign's location filter matches no locations
 *   - 500 any other failure (details are logged, not returned)
 */
export const POST: APIRoute = async ({ request }: { request: Request }) => {
    // Single place to build JSON responses so every branch sets the same headers.
    const json = (body: unknown, status: number): Response =>
        new Response(JSON.stringify(body), {
            status,
            headers: { 'Content-Type': 'application/json' }
        });

    try {
        // A malformed body is a client error, not a server failure.
        let data: any;
        try {
            data = await request.json();
        } catch {
            return json({ error: 'Request body must be valid JSON' }, 400);
        }

        const { queue_id, batch_limit } = data ?? {};

        if (!queue_id) {
            return json({ error: 'queue_id is required' }, 400);
        }

        // Coerce and validate: a string would turn the index arithmetic below
        // into concatenation, and 0/negative values yield an empty slice that
        // would wrongly mark the queue as done.
        const batchLimit = Number(batch_limit ?? 100);
        if (!Number.isInteger(batchLimit) || batchLimit < 1) {
            return json({ error: 'batch_limit must be a positive integer' }, 400);
        }

        const directus = getDirectusClient();

        // Get queue
        const queue = await directus.request(readItem('production_queue', queue_id)) as any;

        if (!queue) {
            return json({ error: 'Queue not found' }, 404);
        }

        if (queue.status !== 'approved' && queue.status !== 'running') {
            return json({ error: 'Queue must be approved to process' }, 400);
        }

        // Mark as running; keep the original start time across repeated calls.
        await directus.request(
            updateItem('production_queue', queue_id, {
                status: 'running',
                started_at: queue.started_at || new Date().toISOString()
            })
        );

        // Get campaign
        const campaign = await directus.request(
            readItem('campaign_masters', queue.campaign)
        ) as any;

        // Slice the next batch out of the schedule.
        const scheduleData: any[] = Array.isArray(queue.schedule_data) ? queue.schedule_data : [];
        const startIndex = Number(queue.completed_count) || 0;
        const endIndex = Math.min(startIndex + batchLimit, scheduleData.length);
        const batchSchedule = scheduleData.slice(startIndex, endIndex);

        if (batchSchedule.length === 0) {
            // All done!
            await directus.request(
                updateItem('production_queue', queue_id, {
                    status: 'done',
                    completed_at: new Date().toISOString()
                })
            );

            return json({
                success: true,
                message: 'Queue complete',
                total_generated: queue.completed_count
            }, 200);
        }

        // Get locations based on filter.
        // NOTE(review): no explicit sort is requested, so offset paging relies
        // on the backend's default ordering being stable - confirm.
        const locationFilter = campaign.target_locations_filter || {};
        let locations = await directus.request(readItems('locations_cities', {
            filter: locationFilter,
            limit: batchSchedule.length,
            offset: startIndex
        })) as any[];

        // When the schedule is longer than the list of matching locations the
        // offset runs past the end. Wrap to the start, mirroring the in-batch
        // modulo fallback below, instead of silently generating nothing.
        if (locations.length === 0 && startIndex > 0) {
            locations = await directus.request(readItems('locations_cities', {
                filter: locationFilter,
                limit: batchSchedule.length,
                offset: 0
            })) as any[];
        }

        if (locations.length === 0) {
            // Nothing can be generated; surface the misconfiguration rather
            // than leaving the queue "running" forever with no progress.
            const message = 'No locations match the campaign target_locations_filter';
            await directus.request(
                updateItem('production_queue', queue_id, { error_log: message })
            );
            return json({ error: message }, 422);
        }

        // Get recipe
        const recipe: string[] = Array.isArray(campaign.content_recipe)
            ? campaign.content_recipe
            : ['intro', 'benefits', 'howto', 'conclusion'];

        let generated = 0;
        const errors: string[] = [];

        for (let i = 0; i < batchSchedule.length; i++) {
            const schedule = batchSchedule[i];
            // Reuse locations cyclically when the page is shorter than the batch.
            const location = locations[i] || locations[i % locations.length];

            try {
                const pubDate = new Date(schedule.publish_date);
                const modDate = new Date(schedule.modified_date);

                const context = {
                    city: location.city || location.name || '',
                    state: location.state || '',
                    county: location.county || '',
                    state_code: getStateCode(location.state) || ''
                };

                // Assemble content from modules
                const { content, modulesUsed } = await assembleFromModules(
                    directus, campaign.site, recipe, context, pubDate
                );

                // Generate headline, falling back to "<city> <campaign name>".
                const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
                    `${context.city} ${campaign.name || 'Guide'}`;

                // Normalise whitespace before counting: splitting text that
                // starts or ends with whitespace yields empty strings that
                // inflated the word count.
                const plainText = content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
                const wordCount = plainText ? plainText.split(' ').length : 0;

                // Create article
                await directus.request(
                    createItem('generated_articles', {
                        site: queue.site,
                        campaign: campaign.id,
                        headline: headline,
                        meta_title: headline.substring(0, 60),
                        // Only add the ellipsis when text was actually cut off.
                        meta_description: plainText.length > 155
                            ? plainText.substring(0, 155) + '...'
                            : plainText,
                        full_html_body: content,
                        word_count: wordCount,
                        is_published: true, // Ghost published
                        is_test_batch: false,
                        date_published: pubDate.toISOString(),
                        date_modified: modDate.toISOString(),
                        sitemap_status: 'ghost',
                        location_city: context.city,
                        location_county: context.county,
                        location_state: context.state,
                        modules_used: modulesUsed
                    })
                );

                generated++;
            } catch (err: unknown) {
                const reason = err instanceof Error ? err.message : String(err);
                errors.push(`Article ${startIndex + i}: ${reason}`);
                // Stop here so completed_count stays a contiguous cursor; the
                // failed entry is retried on the next call.
                break;
            }
        }

        // Update queue progress
        const newCompleted = startIndex + generated;
        const isComplete = newCompleted >= scheduleData.length;

        await directus.request(
            updateItem('production_queue', queue_id, {
                completed_count: newCompleted,
                status: isComplete ? 'done' : 'running',
                completed_at: isComplete ? new Date().toISOString() : null,
                error_log: errors.length > 0 ? errors.join('\n') : null
            })
        );

        // Update site factory status
        await directus.request(
            updateItem('sites', queue.site, {
                factory_status: isComplete ? 'publishing' : 'generating'
            })
        );

        // Log work
        await directus.request(
            createItem('work_log', {
                site: queue.site,
                action: 'batch_generated',
                entity_type: 'production_queue',
                entity_id: queue_id,
                details: {
                    generated,
                    errors: errors.length,
                    progress: `${newCompleted}/${scheduleData.length}`
                }
            })
        );

        let nextStep = 'Call process-queue again to continue.';
        if (isComplete) {
            nextStep = 'Queue complete! Run sitemap-drip cron to start indexing.';
        } else if (errors.length > 0) {
            nextStep = 'Batch stopped at a failing article; check error_log, then call process-queue again to retry.';
        }

        return json({
            success: true,
            generated,
            errors: errors.length,
            progress: {
                completed: newCompleted,
                total: scheduleData.length,
                // scheduleData is non-empty here (the empty case returned above).
                percent: Math.round((newCompleted / scheduleData.length) * 100)
            },
            status: isComplete ? 'done' : 'running',
            next_step: nextStep
        }, 200);
    } catch (error) {
        console.error('Error processing queue:', error);
        return json({ error: 'Failed to process queue' }, 500);
    }
};
|
||||||
|
|
||||||
|
/**
 * Assemble article HTML from content modules.
 *
 * For each module type in the recipe, picks the active module for the site
 * with the lowest usage_count (ties broken by id so the choice is
 * deterministic), substitutes location and year tokens, resolves spintax and
 * bumps the module's usage_count. Module types with no active module are
 * skipped.
 *
 * @param directus Directus client; only `request()` is used.
 * @param siteId   Site whose modules are eligible.
 * @param recipe   Ordered list of module types to include.
 * @param context  Object with `city`, `state`, `county`, `state_code` strings;
 *                 missing values are substituted as ''.
 * @param pubDate  Date passed to replaceYearTokens.
 * @returns The sections joined by blank lines, and the ids of the modules used.
 */
async function assembleFromModules(
    directus: any,
    siteId: string,
    recipe: string[],
    context: any,
    pubDate: Date
): Promise<{ content: string; modulesUsed: string[] }> {
    const parts: string[] = [];
    const modulesUsed: string[] = [];

    const city: string = context.city || '';
    const state: string = context.state || '';
    const county: string = context.county || '';
    const stateCode: string = context.state_code || '';

    for (const moduleType of recipe) {
        const modules = await directus.request(readItems('content_modules', {
            filter: {
                site: { _eq: siteId },
                module_type: { _eq: moduleType },
                is_active: { _eq: true }
            },
            // 'id' tiebreaker matches the assemble-article endpoint and makes
            // the pick deterministic among equally-used modules.
            sort: ['usage_count', 'id'],
            limit: 1
        })) as any[];

        if (modules.length === 0) {
            continue;
        }

        const mod = modules[0];
        let content: string = mod.content_spintax || '';

        // Replace tokens. Function replacers keep "$" in a place name from
        // being read as a replacement pattern. Tokens must be substituted
        // before spintax runs, which would otherwise consume the braces.
        content = content
            .replace(/\{City\}/gi, () => city)
            .replace(/\{State\}/gi, () => state)
            .replace(/\{County\}/gi, () => county)
            .replace(/\{State_Code\}/gi, () => stateCode)
            .replace(/\{Location_City\}/gi, () => city)
            .replace(/\{Location_State\}/gi, () => state);

        content = replaceYearTokens(content, pubDate);
        content = processSpintax(content);

        parts.push(content);
        modulesUsed.push(mod.id);

        // Increment usage.
        // NOTE(review): read-modify-write; concurrent runs can lose an increment.
        await directus.request(updateItem('content_modules', mod.id, {
            usage_count: (mod.usage_count || 0) + 1
        }));
    }

    return { content: parts.join('\n\n'), modulesUsed };
}
|
||||||
|
|
||||||
|
/**
 * Resolve spintax groups of the form {option1|option2|option3}.
 *
 * Each pass replaces every brace pair that contains no other braces with one
 * of its pipe-separated options, chosen uniformly at random, so nested groups
 * are resolved innermost-first.
 *
 * @param text Template text that may contain spintax groups.
 * @returns The text with every resolvable group replaced. Unbalanced braces
 *          and empty groups ("{}") are left untouched.
 */
function processSpintax(text: string): string {
    // Upper bound on nesting depth handled; also a hard stop for safety.
    const maxPasses = 100;
    let result = text;

    for (let pass = 0; pass < maxPasses && result.includes('{'); pass++) {
        const next = result.replace(/\{([^{}]+)\}/g, (_match: string, opts: string) => {
            const choices = opts.split('|');
            return choices[Math.floor(Math.random() * choices.length)];
        });

        // Every replacement removes a brace pair, so an unchanged string means
        // nothing matched (e.g. a stray "{"). Stop instead of re-scanning the
        // same text for the remaining passes.
        if (next === result) break;
        result = next;
    }

    return result;
}
|
||||||
|
|
||||||
|
/**
 * Generate a headline from a spintax title template.
 *
 * Substitutes the same location tokens the article body supports
 * ({City}, {State}, {County}, {State_Code}, {Location_City},
 * {Location_State}; case-insensitive), then year tokens, then resolves
 * spintax. Tokens must be substituted before spintax runs: an unsubstituted
 * "{County}" would be treated as a one-option spintax group and emit the
 * literal text "County".
 *
 * @param template Title template, or null/empty when the campaign has none.
 * @param context  Object with `city`, `state`, `county` and `state_code`
 *                 strings; missing values are substituted as ''.
 * @param date     Date passed to replaceYearTokens.
 * @returns The resolved headline, or '' when no template was given.
 */
function generateHeadline(template: string | null, context: any, date: Date): string {
    if (!template) return '';

    const city: string = context.city || '';
    const state: string = context.state || '';
    const county: string = context.county || '';
    const stateCode: string = context.state_code || '';

    // Function replacers keep "$" in a value from being read as a
    // replacement pattern.
    let h = template
        .replace(/\{City\}/gi, () => city)
        .replace(/\{State\}/gi, () => state)
        .replace(/\{County\}/gi, () => county)
        .replace(/\{State_Code\}/gi, () => stateCode)
        .replace(/\{Location_City\}/gi, () => city)
        .replace(/\{Location_State\}/gi, () => state);

    h = replaceYearTokens(h, date);
    return processSpintax(h);
}
|
||||||
|
|
||||||
|
/** US state names (lower-cased) mapped to their two-letter postal codes. */
const US_STATE_CODES: Readonly<Record<string, string>> = {
    'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
    'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
    'district of columbia': 'DC',
    'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
    'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
    'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
    'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
    'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV',
    'new hampshire': 'NH', 'new jersey': 'NJ', 'new mexico': 'NM', 'new york': 'NY',
    'north carolina': 'NC', 'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK',
    'oregon': 'OR', 'pennsylvania': 'PA', 'rhode island': 'RI', 'south carolina': 'SC',
    'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
    'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', 'west virginia': 'WV',
    'wisconsin': 'WI', 'wyoming': 'WY'
};

/**
 * Look up the two-letter postal code for a US state.
 *
 * Uses the full state table: guessing from the first two letters is wrong for
 * many states (Missouri is "MO" not "MI", Alaska is "AK" not "AL", Tennessee
 * is "TN" not "TE"). Matching ignores case and surrounding/repeated
 * whitespace.
 *
 * @param state Full state name or a two-letter code.
 * @returns The postal code for a known state. For any other non-empty input
 *          the first two characters upper-cased (kept for backward
 *          compatibility; this also passes existing codes such as "FL"
 *          through). '' for missing input.
 */
function getStateCode(state: string): string {
    // Values come from database rows and may be null/undefined.
    if (typeof state !== 'string') return '';

    const trimmed = state.trim();
    if (!trimmed) return '';

    // Own-property check: a plain index lookup would also find inherited
    // members such as "constructor".
    const key = trimmed.replace(/\s+/g, ' ').toLowerCase();
    if (Object.prototype.hasOwnProperty.call(US_STATE_CODES, key)) {
        return US_STATE_CODES[key];
    }

    // Unknown name: legacy best-effort abbreviation.
    return trimmed.substring(0, 2).toUpperCase();
}
|
||||||
170
frontend/src/pages/api/seo/scan-duplicates.ts
Normal file
170
frontend/src/pages/api/seo/scan-duplicates.ts
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
// @ts-ignore - Astro types available at build time
|
||||||
|
import type { APIRoute } from 'astro';
|
||||||
|
import { getDirectusClient, readItems, createItem } from '@/lib/directus/client';
|
||||||
|
|
||||||
|
/**
 * Scan Duplicates API
 *
 * POST /api/seo/scan-duplicates
 *
 * Detects near-duplicate content by shingling: each article's text is split
 * into overlapping N-word sequences and every pair of articles is compared.
 * A pair sharing at least `threshold` shingles gets a `quality_flags` item
 * with a few example sequences and the pair's Jaccard similarity (percent).
 *
 * Request body (JSON):
 *   - batch_ids  array of article ids to scan; or
 *   - queue_id   when batch_ids is absent, the 20 most recently created
 *                test-batch articles are scanned and queue_id is stored on
 *                the flags as batch_id.
 *                NOTE(review): the article query is not filtered by queue_id,
 *                so test articles from other queues can be included - confirm
 *                whether that is intended.
 *   - ngram_size (optional) positive integer, words per shingle; default 7
 *   - threshold  (optional) number >= 1, shared shingles needed to flag; default 3
 *
 * Responses:
 *   - 200 scan result (also when fewer than 2 articles were found)
 *   - 400 malformed JSON or invalid parameters
 *   - 500 any other failure (details are logged, not returned)
 */
export const POST: APIRoute = async ({ request }: { request: Request }) => {
    // Single place to build JSON responses so every branch sets the same headers.
    const json = (body: unknown, status: number): Response =>
        new Response(JSON.stringify(body), {
            status,
            headers: { 'Content-Type': 'application/json' }
        });

    try {
        // A malformed body is a client error, not a server failure.
        let data: any;
        try {
            data = await request.json();
        } catch {
            return json({ error: 'Request body must be valid JSON' }, 400);
        }

        const { queue_id, batch_ids, ngram_size, threshold } = data ?? {};

        if (!queue_id && !batch_ids) {
            return json({ error: 'queue_id or batch_ids required' }, 400);
        }

        // A non-array batch_ids used to fall through to the test-batch scan,
        // silently scanning different articles than the caller asked for.
        if (batch_ids && !Array.isArray(batch_ids)) {
            return json({ error: 'batch_ids must be an array' }, 400);
        }

        // A shingle size below 1 yields empty-string shingles and a threshold
        // below 1 flags every pair, so reject both.
        const ngramSize = Number(ngram_size ?? 7);
        if (!Number.isInteger(ngramSize) || ngramSize < 1) {
            return json({ error: 'ngram_size must be a positive integer' }, 400);
        }
        const minShared = Number(threshold ?? 3);
        if (!Number.isFinite(minShared) || minShared < 1) {
            return json({ error: 'threshold must be a number >= 1' }, 400);
        }

        const directus = getDirectusClient();

        // Get articles to scan
        let articles: any[];
        if (Array.isArray(batch_ids)) {
            articles = await directus.request(readItems('generated_articles', {
                filter: { id: { _in: batch_ids } },
                // Ask for as many rows as ids so a large batch is not cut off
                // by the backend's default page size.
                limit: batch_ids.length,
                fields: ['id', 'site', 'headline', 'full_html_body']
            })) as any[];
        } else {
            // Most recent test-batch articles.
            articles = await directus.request(readItems('generated_articles', {
                filter: { is_test_batch: { _eq: true } },
                sort: ['-date_created'],
                limit: 20,
                fields: ['id', 'site', 'headline', 'full_html_body']
            })) as any[];
        }

        if (articles.length < 2) {
            return json({
                success: true,
                message: 'Need at least 2 articles to compare',
                flags_created: 0
            }, 200);
        }

        // Build the shingle set for each article, remembering its site so a
        // flag can be attributed to the article it concerns.
        const articleShingles: Map<string, Set<string>> = new Map();
        const articleSites: Map<string, unknown> = new Map();

        for (const article of articles) {
            const text = stripHtml(article.full_html_body || '');
            articleShingles.set(article.id, generateShingles(text, ngramSize));
            articleSites.set(article.id, article.site);
        }

        // Compare all pairs
        const collisions: Array<{
            articleA: string;
            articleB: string;
            sharedShingles: string[];
            similarity: number;
        }> = [];

        const articleIds = Array.from(articleShingles.keys());

        for (let i = 0; i < articleIds.length; i++) {
            const idA = articleIds[i];
            const setA = articleShingles.get(idA) ?? new Set<string>();

            for (let j = i + 1; j < articleIds.length; j++) {
                const idB = articleIds[j];
                const setB = articleShingles.get(idB) ?? new Set<string>();

                // Intersection, in A's insertion order.
                const shared = [...setA].filter(s => setB.has(s));
                if (shared.length < minShared) {
                    continue;
                }

                // Jaccard similarity: |A and B| / |A or B|, with
                // |A or B| = |A| + |B| - |A and B| (no third set needed).
                // The union is >= 1 here because shared.length >= minShared >= 1.
                const unionSize = setA.size + setB.size - shared.length;
                const similarity = (shared.length / unionSize) * 100;

                collisions.push({
                    articleA: idA,
                    articleB: idB,
                    sharedShingles: shared.slice(0, 5), // Just first 5 examples
                    similarity
                });
            }
        }

        // Create quality flags for collisions
        let flagsCreated = 0;

        for (const collision of collisions) {
            await directus.request(
                createItem('quality_flags', {
                    // Site of the flagged article, not of whichever article
                    // happened to be first in the result set.
                    site: articleSites.get(collision.articleA),
                    batch_id: queue_id || null,
                    article_a: collision.articleA,
                    article_b: collision.articleB,
                    collision_text: collision.sharedShingles.join(' | '),
                    similarity_score: collision.similarity,
                    status: 'pending'
                })
            );
            flagsCreated++;
        }

        return json({
            success: true,
            articles_scanned: articles.length,
            collisions_found: collisions.length,
            flags_created: flagsCreated,
            details: collisions.map(c => ({
                article_a: c.articleA,
                article_b: c.articleB,
                similarity: c.similarity.toFixed(1) + '%',
                examples: c.sharedShingles
            }))
        }, 200);
    } catch (error) {
        console.error('Error scanning duplicates:', error);
        return json({ error: 'Failed to scan duplicates' }, 500);
    }
};
|
||||||
|
|
||||||
|
/**
 * Strip HTML tags and normalize text for comparison.
 *
 * Tags become spaces, the common entities (&nbsp; &lt; &gt; &amp;) are
 * decoded, whitespace is collapsed and trimmed, and the result is lower-cased.
 *
 * @param html HTML fragment.
 * @returns Lower-cased plain text with single spaces between words.
 */
function stripHtml(html: string): string {
    return html
        .replace(/<[^>]*>/g, ' ')
        .replace(/&nbsp;/g, ' ')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        // Decode &amp; last so "&amp;lt;" becomes "&lt;" rather than "<".
        .replace(/&amp;/g, '&')
        .replace(/\s+/g, ' ')
        .trim()
        .toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Generate N-gram shingles from text.
 *
 * The text is split on whitespace, words of one or two characters are
 * dropped, and every run of `n` consecutive remaining words is joined with
 * single spaces to form a shingle.
 *
 * @param text Normalized plain text.
 * @param n    Words per shingle; fractional values are floored.
 * @returns The set of distinct shingles. Empty when `n` is not at least 1 or
 *          the text has fewer than `n` usable words.
 */
function generateShingles(text: string, n: number): Set<string> {
    const shingles = new Set<string>();

    // A size below 1 (or NaN) would produce empty-string shingles that every
    // article shares, so treat it as "no shingles".
    const size = Math.floor(n);
    if (!(size >= 1)) {
        return shingles;
    }

    const words = text.split(/\s+/).filter(w => w.length > 2);

    for (let i = 0; i + size <= words.length; i++) {
        shingles.add(words.slice(i, i + size).join(' '));
    }

    return shingles;
}
|
||||||
Reference in New Issue
Block a user