From 48bf7778e927f57ec73d56b3f61de6196240962e Mon Sep 17 00:00:00 2001
From: cawcenter
Date: Fri, 12 Dec 2025 11:45:33 -0500
Subject: [PATCH] feat: Ultimate Edition - Modular assembly, duplicate scanner, content modules, quality flags

---
 .../src/pages/api/seo/assemble-article.ts     | 275 ++++++++++++++
 frontend/src/pages/api/seo/process-queue.ts   | 390 ++++++++++++++++++
 frontend/src/pages/api/seo/scan-duplicates.ts | 186 ++++++++++
 3 files changed, 851 insertions(+)
 create mode 100644 frontend/src/pages/api/seo/assemble-article.ts
 create mode 100644 frontend/src/pages/api/seo/process-queue.ts
 create mode 100644 frontend/src/pages/api/seo/scan-duplicates.ts

diff --git a/frontend/src/pages/api/seo/assemble-article.ts b/frontend/src/pages/api/seo/assemble-article.ts
new file mode 100644
index 0000000..1525a38
--- /dev/null
+++ b/frontend/src/pages/api/seo/assemble-article.ts
@@ -0,0 +1,275 @@
+// @ts-ignore - Astro types available at build time
+import type { APIRoute } from 'astro';
+import { getDirectusClient, readItems, createItem, updateItem } from '@/lib/directus/client';
+import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
+
+/** Location values substituted into module and headline templates. */
+interface LocationContext {
+    city: string;
+    state: string;
+    county: string;
+    state_code: string;
+}
+
+/** Module types used when a campaign does not define a content_recipe array. */
+const DEFAULT_RECIPE = ['intro', 'benefits', 'howto', 'conclusion'];
+
+/** Location tokens recognised in module content and headline templates. */
+const LOCATION_TOKEN = /\{(City|State|County|State_Code|Location_City|Location_State)\}/gi;
+
+/**
+ * Assemble Article API
+ *
+ * Builds a full article from content modules based on campaign recipe.
+ * Uses lowest usage_count modules to ensure variety.
+ *
+ * POST /api/seo/assemble-article
+ *
+ * Body: { campaign_id, location: { city, state, county }, publish_date?, modified_date? }
+ * Responds 400 for missing fields or unparseable dates, 404 when the campaign
+ * does not exist, 500 on any unexpected failure, otherwise 200 with a summary
+ * of the created article.
+ */
+export const POST: APIRoute = async ({ request }: { request: Request }) => {
+    try {
+        const data = await request.json();
+        const {
+            campaign_id,
+            location, // { city, state, county }
+            publish_date,
+            modified_date
+        } = data;
+
+        if (!campaign_id || !location) {
+            return jsonResponse({ error: 'campaign_id and location required' }, 400);
+        }
+
+        // Validate dates before any write: toISOString() throws on an Invalid Date,
+        // which would otherwise become a 500 after module usage counts were bumped.
+        const pubDate = publish_date ? new Date(publish_date) : new Date();
+        const modDate = modified_date ? new Date(modified_date) : new Date();
+        if (Number.isNaN(pubDate.getTime()) || Number.isNaN(modDate.getTime())) {
+            return jsonResponse({ error: 'publish_date and modified_date must be valid dates' }, 400);
+        }
+
+        const directus = getDirectusClient();
+
+        // Get campaign with recipe
+        const campaigns = await directus.request(readItems('campaign_masters', {
+            filter: { id: { _eq: campaign_id } },
+            limit: 1
+        })) as any[];
+
+        const campaign = campaigns[0];
+        if (!campaign) {
+            return jsonResponse({ error: 'Campaign not found' }, 404);
+        }
+
+        // A non-array recipe would otherwise be iterated character by character
+        const recipe: string[] = Array.isArray(campaign.content_recipe)
+            ? campaign.content_recipe
+            : DEFAULT_RECIPE;
+
+        // Build context for token replacement
+        const context: LocationContext = {
+            city: location.city || '',
+            state: location.state || '',
+            county: location.county || '',
+            state_code: getStateCode(location.state)
+        };
+
+        // Fetch and assemble modules
+        const assembledParts: string[] = [];
+        const modulesUsed: string[] = [];
+
+        for (const moduleType of recipe) {
+            // Get modules of this type, prefer lowest usage_count
+            const modules = await directus.request(readItems('content_modules', {
+                filter: {
+                    site: { _eq: campaign.site },
+                    module_type: { _eq: moduleType },
+                    is_active: { _eq: true }
+                },
+                sort: ['usage_count', 'id'], // Lowest usage first
+                limit: 1
+            })) as any[];
+
+            const mod = modules[0];
+            if (!mod) continue; // No active module of this type: skip the section
+
+            // Tokens are resolved before spintax so their braces are never
+            // mistaken for single-option spintax groups.
+            let content = replaceLocationTokens(mod.content_spintax || '', context);
+            content = replaceYearTokens(content, pubDate);
+            content = processSpintax(content);
+
+            assembledParts.push(content);
+            modulesUsed.push(mod.id);
+
+            // Increment usage count
+            // NOTE(review): read-modify-write; concurrent requests may lose an increment.
+            await directus.request(
+                updateItem('content_modules', mod.id, {
+                    usage_count: (mod.usage_count || 0) + 1
+                })
+            );
+        }
+
+        const fullContent = assembledParts.join('\n\n');
+        const plainText = stripHtml(fullContent);
+
+        // Headline comes from the campaign title template, with a plain fallback
+        const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
+            `${context.city} ${campaign.name || 'Guide'}`;
+
+        // Generate meta
+        const metaTitle = headline.substring(0, 60);
+        // Only mark the description as truncated when text was actually cut off
+        const metaDescription = plainText.length > 155
+            ? plainText.substring(0, 155) + '...'
+            : plainText;
+
+        // Count words
+        const wordCount = countWords(plainText);
+
+        // Create article
+        const article = await directus.request(
+            createItem('generated_articles', {
+                site: campaign.site,
+                campaign: campaign_id,
+                headline: headline,
+                meta_title: metaTitle,
+                meta_description: metaDescription,
+                full_html_body: fullContent,
+                word_count: wordCount,
+                is_published: false,
+                is_test_batch: false,
+                date_published: pubDate.toISOString(),
+                date_modified: modDate.toISOString(),
+                sitemap_status: 'ghost',
+                location_city: context.city,
+                location_county: context.county,
+                location_state: context.state,
+                modules_used: modulesUsed
+            })
+        ) as any;
+
+        return jsonResponse({
+            success: true,
+            article_id: article.id,
+            headline,
+            word_count: wordCount,
+            modules_used: modulesUsed.length,
+            dates: {
+                published: pubDate.toISOString(),
+                modified: modDate.toISOString()
+            }
+        }, 200);
+    } catch (error) {
+        console.error('Error assembling article:', error);
+        return jsonResponse({ error: 'Failed to assemble article' }, 500);
+    }
+};
+
+/** Build a JSON Response with the given HTTP status. */
+function jsonResponse(body: unknown, status: number): Response {
+    return new Response(JSON.stringify(body), {
+        status,
+        headers: { 'Content-Type': 'application/json' }
+    });
+}
+
+/**
+ * Replace location tokens ({City}, {State}, {County}, {State_Code},
+ * {Location_City}, {Location_State}) case-insensitively in a single pass.
+ */
+function replaceLocationTokens(text: string, context: LocationContext): string {
+    // A replacer function keeps "$" sequences in a value from being read as
+    // replacement patterns, and inserted values are never re-scanned for tokens.
+    return text.replace(LOCATION_TOKEN, (_match, token: string) => {
+        switch (token.toLowerCase()) {
+            case 'city':
+            case 'location_city':
+                return context.city;
+            case 'state':
+            case 'location_state':
+                return context.state;
+            case 'county':
+                return context.county;
+            default:
+                return context.state_code; // 'state_code' is the only token left
+        }
+    });
+}
+
+/**
+ * Process spintax syntax: {option1|option2|option3}
+ *
+ * Nested groups are resolved from the innermost outwards, one level per pass.
+ */
+function processSpintax(text: string): string {
+    let result = text;
+
+    for (let pass = 0; pass < 100; pass++) {
+        const next = result.replace(/\{([^{}]+)\}/g, (_match, options: string) => {
+            const choices = options.split('|');
+            return choices[Math.floor(Math.random() * choices.length)];
+        });
+        // Stop once a pass changes nothing, e.g. only "{}" or a stray "{" remains
+        if (next === result) break;
+        result = next;
+    }
+
+    return result;
+}
+
+/**
+ * Generate headline with spintax and tokens
+ *
+ * Returns '' when no template is configured so the caller can fall back.
+ */
+function generateHeadline(
+    template: string | null | undefined,
+    context: LocationContext,
+    date: Date
+): string {
+    if (!template) return '';
+
+    const withTokens = replaceLocationTokens(template, context);
+    return processSpintax(replaceYearTokens(withTokens, date));
+}
+
+/** Replace tags with spaces and collapse whitespace runs. */
+function stripHtml(html: string): string {
+    return html.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
+}
+
+/** Count whitespace-separated words; empty or blank text counts as 0. */
+function countWords(text: string): number {
+    const trimmed = text.trim();
+    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
+}
+
+/** Map a full US state name to its two-letter code, or '' when unknown. */
+function getStateCode(state: string | null | undefined): string {
+    const codes: Record<string, string> = {
+        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
+        'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
+        'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
+        'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
+        'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
+        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
+        'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
+        'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
+        'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
+        'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
+        'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
+        'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
+        'Wisconsin': 'WI', 'Wyoming': 'WY'
+    };
+    if (!state) return '';
+    // Already a two-letter code (e.g. "fl"): normalise the case and keep it
+    if (/^[A-Za-z]{2}$/.test(state)) return state.toUpperCase();
+    // Own-property check so names like "constructor" do not hit Object.prototype
+    return Object.prototype.hasOwnProperty.call(codes, state) ? codes[state] : '';
+}
diff --git a/frontend/src/pages/api/seo/process-queue.ts b/frontend/src/pages/api/seo/process-queue.ts
new file mode 100644
index 0000000..cf68390
--- /dev/null
+++ b/frontend/src/pages/api/seo/process-queue.ts
@@ -0,0 +1,390 @@
+// @ts-ignore - Astro types available at build time
+import type { APIRoute } from 'astro';
+import { getDirectusClient, readItem, readItems, updateItem, createItem } from '@/lib/directus/client';
+import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
+
+/** Location values substituted into module and headline templates. */
+interface LocationContext {
+    city: string;
+    state: string;
+    county: string;
+    state_code: string;
+}
+
+/** Module types used when a campaign does not define a content_recipe array. */
+const DEFAULT_RECIPE = ['intro', 'benefits', 'howto', 'conclusion'];
+
+/** Location tokens recognised in module content and headline templates. */
+const LOCATION_TOKEN = /\{(City|State|County|State_Code|Location_City|Location_State)\}/gi;
+
+/**
+ * Process Queue API
+ *
+ * Runs the factory: generates all scheduled articles for an approved queue.
+ * Can be called by cron or manually (with limits per call).
+ *
+ * POST /api/seo/process-queue
+ *
+ * Body: { queue_id, batch_limit = 100 }
+ * A batch stops at the first failing slot so that completed_count always equals
+ * the number of leading schedule entries that have an article; the next call
+ * retries the failed slot instead of regenerating slots that already succeeded.
+ */
+export const POST: APIRoute = async ({ request }: { request: Request }) => {
+    try {
+        const data = await request.json();
+        const { queue_id, batch_limit = 100 } = data;
+
+        if (!queue_id) {
+            return jsonResponse({ error: 'queue_id is required' }, 400);
+        }
+
+        // A string or negative limit would corrupt the slice arithmetic below
+        if (!Number.isInteger(batch_limit) || batch_limit < 1) {
+            return jsonResponse({ error: 'batch_limit must be a positive integer' }, 400);
+        }
+
+        const directus = getDirectusClient();
+
+        // Get queue
+        const queue = await directus.request(readItem('production_queue', queue_id)) as any;
+
+        if (!queue) {
+            return jsonResponse({ error: 'Queue not found' }, 404);
+        }
+
+        if (queue.status !== 'approved' && queue.status !== 'running') {
+            return jsonResponse({ error: 'Queue must be approved to process' }, 400);
+        }
+
+        // Get campaign before touching queue state, so a queue whose campaign
+        // is missing is not left marked as running
+        const campaign = await directus.request(
+            readItem('campaign_masters', queue.campaign)
+        ) as any;
+
+        if (!campaign) {
+            return jsonResponse({ error: 'Campaign not found' }, 404);
+        }
+
+        // Mark as running
+        await directus.request(
+            updateItem('production_queue', queue_id, {
+                status: 'running',
+                started_at: queue.started_at || new Date().toISOString()
+            })
+        );
+
+        // Get schedule data
+        const scheduleData: any[] = Array.isArray(queue.schedule_data) ? queue.schedule_data : [];
+        const startIndex: number = queue.completed_count || 0;
+        const endIndex = Math.min(startIndex + batch_limit, scheduleData.length);
+        const batchSchedule = scheduleData.slice(startIndex, endIndex);
+
+        if (batchSchedule.length === 0) {
+            // All done!
+            await directus.request(
+                updateItem('production_queue', queue_id, {
+                    status: 'done',
+                    completed_at: new Date().toISOString()
+                })
+            );
+
+            return jsonResponse({
+                success: true,
+                message: 'Queue complete',
+                total_generated: queue.completed_count
+            }, 200);
+        }
+
+        // Get locations based on filter
+        // NOTE(review): no sort is given, so offset paging relies on the backend
+        // returning rows in a stable order between calls - confirm.
+        const locationFilter = campaign.target_locations_filter || {};
+        const locations = await directus.request(readItems('locations_cities', {
+            filter: locationFilter,
+            limit: batchSchedule.length,
+            offset: startIndex
+        })) as any[];
+
+        // Get recipe
+        const recipe: string[] = Array.isArray(campaign.content_recipe)
+            ? campaign.content_recipe
+            : DEFAULT_RECIPE;
+
+        let generated = 0;
+        const errors: string[] = [];
+
+        for (let i = 0; i < batchSchedule.length; i++) {
+            const schedule = batchSchedule[i];
+            // Reuse locations cyclically when fewer were returned than slots requested
+            const location = locations.length > 0 ? locations[i % locations.length] : undefined;
+
+            try {
+                if (!location) {
+                    throw new Error('No location available for this schedule slot');
+                }
+
+                const pubDate = new Date(schedule.publish_date);
+                const modDate = new Date(schedule.modified_date);
+                // Checked before assembling so a bad schedule entry does not consume modules
+                if (Number.isNaN(pubDate.getTime()) || Number.isNaN(modDate.getTime())) {
+                    throw new Error('Invalid publish_date or modified_date');
+                }
+
+                const context: LocationContext = {
+                    city: location.city || location.name || '',
+                    state: location.state || '',
+                    county: location.county || '',
+                    state_code: getStateCode(location.state)
+                };
+
+                // Assemble content from modules
+                const { content, modulesUsed } = await assembleFromModules(
+                    directus, campaign.site, recipe, context, pubDate
+                );
+                const plainText = stripHtml(content);
+
+                // Generate headline
+                const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
+                    `${context.city} ${campaign.name || 'Guide'}`;
+
+                // Create article
+                await directus.request(
+                    createItem('generated_articles', {
+                        site: queue.site,
+                        campaign: campaign.id,
+                        headline: headline,
+                        meta_title: headline.substring(0, 60),
+                        meta_description: plainText.length > 155
+                            ? plainText.substring(0, 155) + '...'
+                            : plainText,
+                        full_html_body: content,
+                        word_count: countWords(plainText),
+                        is_published: true, // Ghost published
+                        is_test_batch: false,
+                        date_published: pubDate.toISOString(),
+                        date_modified: modDate.toISOString(),
+                        sitemap_status: 'ghost',
+                        location_city: context.city,
+                        location_county: context.county,
+                        location_state: context.state,
+                        modules_used: modulesUsed
+                    })
+                );
+
+                generated++;
+            } catch (err: unknown) {
+                errors.push(`Article ${startIndex + i}: ${describeError(err)}`);
+                // Stop at the first failure: skipping ahead would leave a gap that
+                // completed_count cannot express, and the next call would then
+                // regenerate the later slots as duplicates.
+                break;
+            }
+        }
+
+        // Update queue progress
+        const newCompleted = startIndex + generated;
+        const isComplete = newCompleted >= scheduleData.length;
+
+        await directus.request(
+            updateItem('production_queue', queue_id, {
+                completed_count: newCompleted,
+                status: isComplete ? 'done' : 'running',
+                completed_at: isComplete ? new Date().toISOString() : null,
+                error_log: errors.length > 0 ? errors.join('\n') : null
+            })
+        );
+
+        // Update site factory status
+        await directus.request(
+            updateItem('sites', queue.site, {
+                factory_status: isComplete ? 'publishing' : 'generating'
+            })
+        );
+
+        // Log work
+        await directus.request(
+            createItem('work_log', {
+                site: queue.site,
+                action: 'batch_generated',
+                entity_type: 'production_queue',
+                entity_id: queue_id,
+                details: {
+                    generated,
+                    errors: errors.length,
+                    progress: `${newCompleted}/${scheduleData.length}`
+                }
+            })
+        );
+
+        return jsonResponse({
+            success: true,
+            generated,
+            errors: errors.length,
+            progress: {
+                completed: newCompleted,
+                total: scheduleData.length,
+                percent: Math.round((newCompleted / scheduleData.length) * 100)
+            },
+            status: isComplete ? 'done' : 'running',
+            next_step: isComplete
+                ? 'Queue complete! Run sitemap-drip cron to start indexing.'
+                : 'Call process-queue again to continue.'
+        }, 200);
+    } catch (error) {
+        console.error('Error processing queue:', error);
+        return jsonResponse({ error: 'Failed to process queue' }, 500);
+    }
+};
+
+/** Build a JSON Response with the given HTTP status. */
+function jsonResponse(body: unknown, status: number): Response {
+    return new Response(JSON.stringify(body), {
+        status,
+        headers: { 'Content-Type': 'application/json' }
+    });
+}
+
+/**
+ * Build article HTML by taking, for each module type in the recipe, the active
+ * module of the site with the lowest usage_count, then bumping its usage_count.
+ * Module types with no active module are skipped.
+ */
+async function assembleFromModules(
+    directus: any,
+    siteId: string,
+    recipe: string[],
+    context: LocationContext,
+    pubDate: Date
+): Promise<{ content: string; modulesUsed: string[] }> {
+    const parts: string[] = [];
+    const modulesUsed: string[] = [];
+
+    for (const moduleType of recipe) {
+        const modules = await directus.request(readItems('content_modules', {
+            filter: {
+                site: { _eq: siteId },
+                module_type: { _eq: moduleType },
+                is_active: { _eq: true }
+            },
+            sort: ['usage_count', 'id'], // id breaks ties deterministically
+            limit: 1
+        })) as any[];
+
+        const mod = modules[0];
+        if (!mod) continue;
+
+        // Replace tokens, then resolve spintax
+        let content = replaceLocationTokens(mod.content_spintax || '', context);
+        content = replaceYearTokens(content, pubDate);
+        content = processSpintax(content);
+
+        parts.push(content);
+        modulesUsed.push(mod.id);
+
+        // Increment usage
+        await directus.request(updateItem('content_modules', mod.id, {
+            usage_count: (mod.usage_count || 0) + 1
+        }));
+    }
+
+    return { content: parts.join('\n\n'), modulesUsed };
+}
+
+/**
+ * Replace location tokens ({City}, {State}, {County}, {State_Code},
+ * {Location_City}, {Location_State}) case-insensitively in a single pass.
+ */
+function replaceLocationTokens(text: string, context: LocationContext): string {
+    // A replacer function keeps "$" sequences in a value from being read as
+    // replacement patterns, and inserted values are never re-scanned for tokens.
+    return text.replace(LOCATION_TOKEN, (_match, token: string) => {
+        switch (token.toLowerCase()) {
+            case 'city':
+            case 'location_city':
+                return context.city;
+            case 'state':
+            case 'location_state':
+                return context.state;
+            case 'county':
+                return context.county;
+            default:
+                return context.state_code; // 'state_code' is the only token left
+        }
+    });
+}
+
+/** Resolve {a|b|c} spintax groups, innermost first, choosing options at random. */
+function processSpintax(text: string): string {
+    let result = text;
+    for (let pass = 0; pass < 100; pass++) {
+        const next = result.replace(/\{([^{}]+)\}/g, (_match, options: string) => {
+            const choices = options.split('|');
+            return choices[Math.floor(Math.random() * choices.length)];
+        });
+        // Stop once a pass changes nothing, e.g. only "{}" or a stray "{" remains
+        if (next === result) break;
+        result = next;
+    }
+    return result;
+}
+
+/** Build the headline from the campaign title template; '' when there is none. */
+function generateHeadline(
+    template: string | null | undefined,
+    context: LocationContext,
+    date: Date
+): string {
+    if (!template) return '';
+    const withTokens = replaceLocationTokens(template, context);
+    return processSpintax(replaceYearTokens(withTokens, date));
+}
+
+/** Replace tags with spaces and collapse whitespace runs. */
+function stripHtml(html: string): string {
+    return html.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
+}
+
+/** Count whitespace-separated words; empty or blank text counts as 0. */
+function countWords(text: string): number {
+    const trimmed = text.trim();
+    return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
+}
+
+/** Best-effort message for a thrown value, which may not be an Error instance. */
+function describeError(err: unknown): string {
+    if (err instanceof Error) return err.message;
+    if (typeof err === 'object' && err !== null && 'message' in err) {
+        return String((err as { message: unknown }).message);
+    }
+    return String(err);
+}
+
+/**
+ * Map a full US state name to its two-letter code, or '' when unknown.
+ * Unknown names are not abbreviated by truncation: that yields wrong codes
+ * (e.g. "Alaska" would become "AL", which is Alabama).
+ */
+function getStateCode(state: string | null | undefined): string {
+    const codes: Record<string, string> = {
+        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
+        'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
+        'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
+        'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
+        'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
+        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
+        'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
+        'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
+        'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
+        'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
+        'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
+        'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
+        'Wisconsin': 'WI', 'Wyoming': 'WY'
+    };
+    if (!state) return '';
+    // Already a two-letter code (e.g. "fl"): normalise the case and keep it
+    if (/^[A-Za-z]{2}$/.test(state)) return state.toUpperCase();
+    // Own-property check so names like "constructor" do not hit Object.prototype
+    return Object.prototype.hasOwnProperty.call(codes, state) ? codes[state] : '';
+}
diff --git a/frontend/src/pages/api/seo/scan-duplicates.ts b/frontend/src/pages/api/seo/scan-duplicates.ts
new file mode 100644
index 0000000..839db64
--- /dev/null
+++ b/frontend/src/pages/api/seo/scan-duplicates.ts
@@ -0,0 +1,186 @@
+// @ts-ignore - Astro types available at build time
+import type { APIRoute } from 'astro';
+import { getDirectusClient, readItems, createItem } from '@/lib/directus/client';
+
+/** A pair of articles sharing at least `threshold` shingles. */
+interface Collision {
+    articleA: string;
+    articleB: string;
+    sharedShingles: string[];
+    similarity: number;
+}
+
+/** Plain-text replacements for the HTML entities decoded by stripHtml. */
+const ENTITY_TEXT: Record<string, string> = { nbsp: ' ', amp: '&', lt: '<', gt: '>' };
+
+/**
+ * Scan Duplicates API
+ *
+ * Uses shingle hashing to detect duplicate N-gram sequences across articles.
+ * Flags any articles that share 7+ word sequences.
+ *
+ * POST /api/seo/scan-duplicates
+ *
+ * Body: { queue_id?, batch_ids?, ngram_size = 7, threshold = 3 }
+ * A pair is flagged when it shares at least `threshold` distinct shingles of
+ * `ngram_size` words; similarity_score is the Jaccard index as a percentage.
+ */
+export const POST: APIRoute = async ({ request }: { request: Request }) => {
+    try {
+        const data = await request.json();
+        const { queue_id, batch_ids, ngram_size = 7, threshold = 3 } = data;
+
+        if (!queue_id && !batch_ids) {
+            return jsonResponse({ error: 'queue_id or batch_ids required' }, 400);
+        }
+
+        // A non-array batch_ids used to fall through silently to the test-batch scan
+        if (batch_ids && !Array.isArray(batch_ids)) {
+            return jsonResponse({ error: 'batch_ids must be an array' }, 400);
+        }
+
+        // n < 1 yields empty shingles; threshold < 1 would flag every pair
+        if (!Number.isInteger(ngram_size) || ngram_size < 1 ||
+            !Number.isInteger(threshold) || threshold < 1) {
+            return jsonResponse({ error: 'ngram_size and threshold must be positive integers' }, 400);
+        }
+
+        const directus = getDirectusClient();
+
+        // Get articles to scan
+        let articles: any[];
+        if (Array.isArray(batch_ids)) {
+            articles = await directus.request(readItems('generated_articles', {
+                filter: { id: { _in: batch_ids } },
+                fields: ['id', 'site', 'headline', 'full_html_body']
+            })) as any[];
+        } else {
+            // Get the most recent test batch articles
+            // NOTE(review): queue_id is not part of this filter, so test articles of
+            // other queues and sites are included - confirm whether that is intended.
+            articles = await directus.request(readItems('generated_articles', {
+                filter: { is_test_batch: { _eq: true } },
+                sort: ['-date_created'],
+                limit: 20,
+                fields: ['id', 'site', 'headline', 'full_html_body']
+            })) as any[];
+        }
+
+        if (articles.length < 2) {
+            return jsonResponse({
+                success: true,
+                message: 'Need at least 2 articles to compare',
+                flags_created: 0
+            }, 200);
+        }
+
+        // Build shingle sets for each article, remembering each article's site
+        const articleShingles: Map<string, Set<string>> = new Map();
+        const articleSites: Map<string, unknown> = new Map();
+
+        for (const article of articles) {
+            const text = stripHtml(article.full_html_body || '');
+            articleShingles.set(article.id, generateShingles(text, ngram_size));
+            articleSites.set(article.id, article.site);
+        }
+
+        // Compare all pairs
+        const collisions: Collision[] = [];
+        const entries = Array.from(articleShingles.entries());
+
+        for (let i = 0; i < entries.length; i++) {
+            const [idA, setA] = entries[i];
+            for (let j = i + 1; j < entries.length; j++) {
+                const [idB, setB] = entries[j];
+
+                // Find intersection
+                const shared = [...setA].filter(s => setB.has(s));
+                if (shared.length < threshold) continue;
+
+                // Jaccard similarity, using |A or B| = |A| + |B| - |A and B|
+                const unionSize = setA.size + setB.size - shared.length;
+                collisions.push({
+                    articleA: idA,
+                    articleB: idB,
+                    sharedShingles: shared.slice(0, 5), // Just first 5 examples
+                    similarity: (shared.length / unionSize) * 100
+                });
+            }
+        }
+
+        // Create quality flags for collisions
+        let flagsCreated = 0;
+
+        for (const collision of collisions) {
+            await directus.request(
+                createItem('quality_flags', {
+                    // Use the flagged article's own site; a scan may span several sites
+                    site: articleSites.get(collision.articleA),
+                    batch_id: queue_id || null,
+                    article_a: collision.articleA,
+                    article_b: collision.articleB,
+                    collision_text: collision.sharedShingles.join(' | '),
+                    similarity_score: collision.similarity,
+                    status: 'pending'
+                })
+            );
+            flagsCreated++;
+        }
+
+        return jsonResponse({
+            success: true,
+            articles_scanned: articles.length,
+            collisions_found: collisions.length,
+            flags_created: flagsCreated,
+            details: collisions.map(c => ({
+                article_a: c.articleA,
+                article_b: c.articleB,
+                similarity: c.similarity.toFixed(1) + '%',
+                examples: c.sharedShingles
+            }))
+        }, 200);
+    } catch (error) {
+        console.error('Error scanning duplicates:', error);
+        return jsonResponse({ error: 'Failed to scan duplicates' }, 500);
+    }
+};
+
+/** Build a JSON Response with the given HTTP status. */
+function jsonResponse(body: unknown, status: number): Response {
+    return new Response(JSON.stringify(body), {
+        status,
+        headers: { 'Content-Type': 'application/json' }
+    });
+}
+
+/**
+ * Strip HTML tags and normalize text
+ *
+ * Decodes the nbsp, amp, lt and gt entities in one pass (so an escaped
+ * ampersand is never decoded twice), collapses whitespace and lowercases.
+ */
+function stripHtml(html: string): string {
+    return html
+        .replace(/<[^>]*>/g, ' ')
+        .replace(/&(nbsp|amp|lt|gt);/g, (_match, name: string) => ENTITY_TEXT[name])
+        .replace(/\s+/g, ' ')
+        .trim()
+        .toLowerCase();
+}
+
+/**
+ * Generate N-gram shingles from text
+ *
+ * Words of one or two characters are dropped before shingling. Returns an
+ * empty set when the text has fewer than n remaining words.
+ */
+function generateShingles(text: string, n: number): Set<string> {
+    const words = text.split(/\s+/).filter(w => w.length > 2);
+    const shingles = new Set<string>();
+
+    for (let i = 0; i <= words.length - n; i++) {
+        shingles.add(words.slice(i, i + n).join(' '));
+    }
+
+    return shingles;
+}