feat: Ultimate Edition - Modular assembly, duplicate scanner, content modules, quality flags

This commit is contained in:
cawcenter
2025-12-12 11:45:33 -05:00
parent 0576967bd5
commit 48bf7778e9
3 changed files with 684 additions and 0 deletions

View File

@@ -0,0 +1,224 @@
// @ts-ignore - Astro types available at build time
import type { APIRoute } from 'astro';
import { getDirectusClient, readItems, createItem, updateItem } from '@/lib/directus/client';
import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
/**
* Assemble Article API
*
* Builds a full article from content modules based on campaign recipe.
* Uses lowest usage_count modules to ensure variety.
*
* POST /api/seo/assemble-article
*/
export const POST: APIRoute = async ({ request }: { request: Request }) => {
  // Contract: expects a JSON body with `campaign_id`, `location` ({ city, state, county }),
  // and optional `publish_date` / `modified_date`. Responds with a JSON envelope on
  // every path (400 validation, 404 unknown campaign, 500 unexpected failure, 200 success).
  const json = (body: unknown, status: number): Response =>
    new Response(JSON.stringify(body), {
      status,
      headers: { 'Content-Type': 'application/json' }
    });

  try {
    const data = await request.json();
    const {
      campaign_id,
      location, // { city, state, county }
      publish_date,
      modified_date
    } = data;

    if (!campaign_id || !location) {
      return json({ error: 'campaign_id and location required' }, 400);
    }

    // Missing dates default to "now". Unparseable dates are rejected here because
    // Date#toISOString() throws a RangeError on an invalid Date, which would
    // otherwise surface as an opaque 500 further down.
    const pubDate = publish_date ? new Date(publish_date) : new Date();
    const modDate = modified_date ? new Date(modified_date) : new Date();
    if (Number.isNaN(pubDate.getTime()) || Number.isNaN(modDate.getTime())) {
      return json({ error: 'publish_date and modified_date must be valid dates' }, 400);
    }

    const directus = getDirectusClient();

    // Get campaign with recipe
    const campaigns = await directus.request(readItems('campaign_masters', {
      filter: { id: { _eq: campaign_id } },
      limit: 1
    })) as any[];
    const campaign = campaigns[0];
    if (!campaign) {
      return json({ error: 'Campaign not found' }, 404);
    }

    // The recipe is the ordered list of module types; fall back to a default outline.
    const recipe = campaign.content_recipe || ['intro', 'benefits', 'howto', 'conclusion'];

    // Build context for token replacement
    const context = {
      city: location.city || '',
      state: location.state || '',
      county: location.county || '',
      state_code: getStateCode(location.state) || '',
      year: pubDate.getFullYear()
    };

    // Fetch and assemble modules, one per recipe entry
    const assembledParts: string[] = [];
    const modulesUsed: string[] = [];

    for (const moduleType of recipe) {
      // Pick the least-used active module of this type (ties broken by id)
      const modules = await directus.request(readItems('content_modules', {
        filter: {
          site: { _eq: campaign.site },
          module_type: { _eq: moduleType },
          is_active: { _eq: true }
        },
        sort: ['usage_count', 'id'], // Lowest usage first
        limit: 1
      })) as any[];

      // A recipe entry with no matching module is skipped
      if (modules.length === 0) continue;
      const mod = modules[0];

      let content = mod.content_spintax || '';

      // Replace location tokens. Replacer functions are used so that `$&`, `$1`, ...
      // inside request-supplied values are inserted literally rather than being
      // interpreted as replacement patterns.
      content = content
        .replace(/\{City\}/gi, () => context.city)
        .replace(/\{State\}/gi, () => context.state)
        .replace(/\{County\}/gi, () => context.county)
        .replace(/\{State_Code\}/gi, () => context.state_code)
        .replace(/\{Location_City\}/gi, () => context.city)
        .replace(/\{Location_State\}/gi, () => context.state);

      // Replace year tokens, then resolve {a|b|c} spintax groups
      content = replaceYearTokens(content, pubDate);
      content = processSpintax(content);

      assembledParts.push(content);
      modulesUsed.push(mod.id);

      // Increment usage count so the next assembly prefers a different module.
      // NOTE(review): this is a read-modify-write; concurrent requests can lose increments.
      await directus.request(
        updateItem('content_modules', mod.id, {
          usage_count: (mod.usage_count || 0) + 1
        })
      );
    }

    const fullContent = assembledParts.join('\n\n');
    const plainText = stripHtml(fullContent);

    // Headline comes from the campaign's title template, with a generic fallback
    const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
      `${context.city} ${campaign.name || 'Guide'}`;

    // Generate meta; only add an ellipsis when the description was actually cut
    const metaTitle = headline.substring(0, 60);
    const metaDescription = plainText.length > 155
      ? plainText.substring(0, 155) + '...'
      : plainText;

    // Count words; ''.split(/\s+/) yields [''], so empty content must be special-cased
    const wordCount = plainText ? plainText.split(/\s+/).length : 0;

    // Create article
    const article = await directus.request(
      createItem('generated_articles', {
        site: campaign.site,
        campaign: campaign_id,
        headline: headline,
        meta_title: metaTitle,
        meta_description: metaDescription,
        full_html_body: fullContent,
        word_count: wordCount,
        is_published: false,
        is_test_batch: false,
        date_published: pubDate.toISOString(),
        date_modified: modDate.toISOString(),
        sitemap_status: 'ghost',
        location_city: context.city,
        location_county: context.county,
        location_state: context.state,
        modules_used: modulesUsed
      })
    ) as any;

    return json({
      success: true,
      article_id: article.id,
      headline,
      word_count: wordCount,
      modules_used: modulesUsed.length,
      dates: {
        published: pubDate.toISOString(),
        modified: modDate.toISOString()
      }
    }, 200);
  } catch (error) {
    console.error('Error assembling article:', error);
    return json({ error: 'Failed to assemble article' }, 500);
  }
};
/**
 * Resolve spintax groups of the form {option1|option2|option3}.
 *
 * Each pass replaces every innermost group (one containing no braces) with a
 * randomly chosen option, so nested groups are resolved from the inside out.
 * A group without a `|` collapses to its own text.
 *
 * @param text - Text that may contain spintax groups.
 * @returns The text with every well-formed group resolved. Unbalanced braces
 *          and empty `{}` pairs are left untouched.
 */
function processSpintax(text: string): string {
  let result = text;

  // Hard cap on passes, i.e. on the nesting depth that will be resolved.
  for (let pass = 0; pass < 100 && result.includes('{'); pass++) {
    const next = result.replace(/\{([^{}]+)\}/g, (_match, options: string) => {
      const choices = options.split('|');
      return choices[Math.floor(Math.random() * choices.length)];
    });

    // Every match removes its braces, so an unchanged string means nothing
    // matched (e.g. a stray '{'); further passes would be wasted work.
    if (next === result) break;
    result = next;
  }

  return result;
}
/**
 * Build a headline from a title template.
 *
 * Substitutes the {City}, {State}, {County} and {State_Code} tokens
 * (case-insensitively) from `context`, passes the result through
 * `replaceYearTokens` with `date`, and finally resolves spintax groups.
 *
 * @param template - Title template; a null or empty template yields ''.
 * @param context - Object read for `city`, `state`, `county` and `state_code`;
 *                  missing values are substituted as empty strings.
 * @param date - Date handed to `replaceYearTokens`.
 * @returns The resolved headline, or '' when there is no template.
 */
function generateHeadline(template: string | null, context: any, date: Date): string {
  if (!template) return '';

  // Replacer functions insert the value literally; a plain string replacement
  // would interpret `$&`, `$1`, ... inside the value as replacement patterns.
  const token = (key: string) => (): string => String(context?.[key] ?? '');

  let headline = template
    .replace(/\{City\}/gi, token('city'))
    .replace(/\{State\}/gi, token('state'))
    .replace(/\{County\}/gi, token('county'))
    // Without this, {State_Code} would be treated as a one-option spintax group
    // and collapse to the literal text "State_Code".
    .replace(/\{State_Code\}/gi, token('state_code'));

  headline = replaceYearTokens(headline, date);
  return processSpintax(headline);
}
/**
 * Reduce an HTML fragment to plain text: every tag becomes a space, runs of
 * whitespace collapse to a single space, and the ends are trimmed.
 */
function stripHtml(html: string): string {
  const withoutTags = html.replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Map a US state name to its two-letter postal abbreviation.
 *
 * The lookup ignores letter case and surrounding whitespace.
 *
 * @param state - Full state name, e.g. "New York".
 * @returns The two-letter code, or '' when the name is not one of the 50 states
 *          (or is not a string at all).
 */
function getStateCode(state: string): string {
  if (typeof state !== 'string') return '';

  // A Map is used instead of an object literal so that names such as
  // "constructor" or "toString" cannot resolve to Object.prototype members.
  const codes = new Map<string, string>([
    ['alabama', 'AL'], ['alaska', 'AK'], ['arizona', 'AZ'], ['arkansas', 'AR'],
    ['california', 'CA'], ['colorado', 'CO'], ['connecticut', 'CT'], ['delaware', 'DE'],
    ['florida', 'FL'], ['georgia', 'GA'], ['hawaii', 'HI'], ['idaho', 'ID'],
    ['illinois', 'IL'], ['indiana', 'IN'], ['iowa', 'IA'], ['kansas', 'KS'],
    ['kentucky', 'KY'], ['louisiana', 'LA'], ['maine', 'ME'], ['maryland', 'MD'],
    ['massachusetts', 'MA'], ['michigan', 'MI'], ['minnesota', 'MN'], ['mississippi', 'MS'],
    ['missouri', 'MO'], ['montana', 'MT'], ['nebraska', 'NE'], ['nevada', 'NV'],
    ['new hampshire', 'NH'], ['new jersey', 'NJ'], ['new mexico', 'NM'], ['new york', 'NY'],
    ['north carolina', 'NC'], ['north dakota', 'ND'], ['ohio', 'OH'], ['oklahoma', 'OK'],
    ['oregon', 'OR'], ['pennsylvania', 'PA'], ['rhode island', 'RI'], ['south carolina', 'SC'],
    ['south dakota', 'SD'], ['tennessee', 'TN'], ['texas', 'TX'], ['utah', 'UT'],
    ['vermont', 'VT'], ['virginia', 'VA'], ['washington', 'WA'], ['west virginia', 'WV'],
    ['wisconsin', 'WI'], ['wyoming', 'WY']
  ]);

  return codes.get(state.trim().toLowerCase()) ?? '';
}

View File

@@ -0,0 +1,290 @@
// @ts-ignore - Astro types available at build time
import type { APIRoute } from 'astro';
import { getDirectusClient, readItem, readItems, updateItem, createItem } from '@/lib/directus/client';
import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
/**
* Process Queue API
*
* Runs the factory: generates all scheduled articles for an approved queue.
* Can be called by cron or manually (with limits per call).
*
* POST /api/seo/process-queue
*/
export const POST: APIRoute = async ({ request }: { request: Request }) => {
  // Contract: expects a JSON body with `queue_id` and an optional `batch_limit`
  // (default 100). Each call generates at most `batch_limit` articles, resuming
  // from the queue's `completed_count`, and responds with a JSON envelope.
  const json = (body: unknown, status: number): Response =>
    new Response(JSON.stringify(body), {
      status,
      headers: { 'Content-Type': 'application/json' }
    });

  try {
    const data = await request.json();
    const { queue_id, batch_limit = 100 } = data;

    if (!queue_id) {
      return json({ error: 'queue_id is required' }, 400);
    }

    // A zero, negative, null or non-numeric limit would produce an empty batch,
    // and an empty batch is what marks the queue as done below. Reject it instead.
    const batchLimit = Number(batch_limit);
    if (!Number.isInteger(batchLimit) || batchLimit < 1) {
      return json({ error: 'batch_limit must be a positive integer' }, 400);
    }

    const directus = getDirectusClient();

    // Get queue
    const queue = await directus.request(readItem('production_queue', queue_id)) as any;
    if (!queue) {
      return json({ error: 'Queue not found' }, 404);
    }
    if (queue.status !== 'approved' && queue.status !== 'running') {
      return json({ error: 'Queue must be approved to process' }, 400);
    }

    // Mark as running; keep the original start time when resuming
    await directus.request(
      updateItem('production_queue', queue_id, {
        status: 'running',
        started_at: queue.started_at || new Date().toISOString()
      })
    );

    // Get campaign
    const campaign = await directus.request(
      readItem('campaign_masters', queue.campaign)
    ) as any;
    if (!campaign) {
      return json({ error: 'Campaign not found' }, 404);
    }

    // Slice out this call's portion of the schedule
    const scheduleData = queue.schedule_data || [];
    const startIndex = queue.completed_count || 0;
    const endIndex = Math.min(startIndex + batchLimit, scheduleData.length);
    const batchSchedule = scheduleData.slice(startIndex, endIndex);

    if (batchSchedule.length === 0) {
      // Nothing left to schedule: all done!
      await directus.request(
        updateItem('production_queue', queue_id, {
          status: 'done',
          completed_at: new Date().toISOString()
        })
      );
      return json({
        success: true,
        message: 'Queue complete',
        total_generated: queue.completed_count
      }, 200);
    }

    // Get locations based on the campaign's filter, paged in step with the schedule
    const locationFilter = campaign.target_locations_filter || {};
    const locations = await directus.request(readItems('locations_cities', {
      filter: locationFilter,
      limit: batchSchedule.length,
      offset: startIndex
    })) as any[];

    // Get recipe
    const recipe = campaign.content_recipe || ['intro', 'benefits', 'howto', 'conclusion'];

    let generated = 0;
    const errors: string[] = [];

    for (let i = 0; i < batchSchedule.length; i++) {
      const schedule = batchSchedule[i];
      // When fewer locations than schedule slots came back, wrap around and reuse them
      const location = locations[i] || locations[i % locations.length];
      if (!location) {
        // Record the skip instead of dropping it silently, so a queue that can
        // never progress is visible in the error log.
        errors.push(`Article ${i}: no location available`);
        continue;
      }

      try {
        const pubDate = new Date(schedule.publish_date);
        const modDate = new Date(schedule.modified_date);

        const context = {
          city: location.city || location.name || '',
          state: location.state || '',
          county: location.county || '',
          state_code: getStateCode(location.state) || ''
        };

        // Assemble content from modules
        const { content, modulesUsed } = await assembleFromModules(
          directus, campaign.site, recipe, context, pubDate
        );

        // Generate headline, with a generic fallback when the template is empty
        const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
          `${context.city} ${campaign.name || 'Guide'}`;

        // Normalise to plain text once. Without the collapse + trim, the spaces
        // left behind by stripped tags inflate the word count and lead the
        // meta description with whitespace.
        const plainText = content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
        const wordCount = plainText ? plainText.split(' ').length : 0;
        const metaDescription = plainText.length > 155
          ? plainText.substring(0, 155) + '...'
          : plainText;

        // Create article
        await directus.request(
          createItem('generated_articles', {
            site: queue.site,
            campaign: campaign.id,
            headline: headline,
            meta_title: headline.substring(0, 60),
            meta_description: metaDescription,
            full_html_body: content,
            word_count: wordCount,
            is_published: true, // Ghost published
            is_test_batch: false,
            date_published: pubDate.toISOString(),
            date_modified: modDate.toISOString(),
            sitemap_status: 'ghost',
            location_city: context.city,
            location_county: context.county,
            location_state: context.state,
            modules_used: modulesUsed
          })
        );
        generated++;
      } catch (err: any) {
        // One failed article must not abort the rest of the batch
        errors.push(`Article ${i}: ${err?.message ?? String(err)}`);
      }
    }

    // Update queue progress.
    // NOTE(review): progress advances by the number of successes only, so a
    // failure in the middle of a batch shifts the next call's start index
    // relative to the schedule. Confirm whether per-slot tracking is needed.
    const newCompleted = startIndex + generated;
    const isComplete = newCompleted >= scheduleData.length;

    await directus.request(
      updateItem('production_queue', queue_id, {
        completed_count: newCompleted,
        status: isComplete ? 'done' : 'running',
        completed_at: isComplete ? new Date().toISOString() : null,
        error_log: errors.length > 0 ? errors.join('\n') : null
      })
    );

    // Update site factory status
    await directus.request(
      updateItem('sites', queue.site, {
        factory_status: isComplete ? 'publishing' : 'generating'
      })
    );

    // Log work
    await directus.request(
      createItem('work_log', {
        site: queue.site,
        action: 'batch_generated',
        entity_type: 'production_queue',
        entity_id: queue_id,
        details: {
          generated,
          errors: errors.length,
          progress: `${newCompleted}/${scheduleData.length}`
        }
      })
    );

    return json({
      success: true,
      generated,
      errors: errors.length,
      progress: {
        completed: newCompleted,
        total: scheduleData.length,
        // scheduleData is non-empty here (the empty-batch case returned above)
        percent: Math.round((newCompleted / scheduleData.length) * 100)
      },
      status: isComplete ? 'done' : 'running',
      next_step: isComplete
        ? 'Queue complete! Run sitemap-drip cron to start indexing.'
        : 'Call process-queue again to continue.'
    }, 200);
  } catch (error) {
    console.error('Error processing queue:', error);
    return json({ error: 'Failed to process queue' }, 500);
  }
};
/**
 * Assemble article content from one content module per recipe entry.
 *
 * For each module type in `recipe`, the least-used active module for the site
 * is fetched, its location tokens, year tokens and spintax are resolved, and
 * its `usage_count` is incremented. Recipe entries with no matching module are
 * skipped.
 *
 * @param directus - Client whose `request` method executes the read/update commands.
 * @param siteId - Site whose modules are eligible.
 * @param recipe - Ordered list of module types to assemble.
 * @param context - Object read for `city`, `state`, `county` and `state_code`.
 * @param pubDate - Date handed to `replaceYearTokens`.
 * @returns The parts joined by blank lines, plus the ids of the modules used.
 */
async function assembleFromModules(
  directus: any,
  siteId: string,
  recipe: string[],
  context: any,
  pubDate: Date
): Promise<{ content: string; modulesUsed: string[] }> {
  const parts: string[] = [];
  const modulesUsed: string[] = [];

  // Replacer functions insert the value literally; a plain string replacement
  // would interpret `$&`, `$1`, ... inside the value as replacement patterns.
  const token = (key: string) => (): string => String(context?.[key] ?? '');

  for (const moduleType of recipe) {
    const modules = await directus.request(readItems('content_modules', {
      filter: {
        site: { _eq: siteId },
        module_type: { _eq: moduleType },
        is_active: { _eq: true }
      },
      // Lowest usage first; `id` makes the pick deterministic among ties
      sort: ['usage_count', 'id'],
      limit: 1
    })) as any[];

    if (modules.length === 0) continue;
    const mod = modules[0];

    let content = mod.content_spintax || '';

    // Replace tokens before spintax runs, otherwise an unreplaced {Token}
    // would be treated as a one-option spintax group.
    content = content
      .replace(/\{City\}/gi, token('city'))
      .replace(/\{State\}/gi, token('state'))
      .replace(/\{County\}/gi, token('county'))
      .replace(/\{State_Code\}/gi, token('state_code'))
      .replace(/\{Location_City\}/gi, token('city'))
      .replace(/\{Location_State\}/gi, token('state'));

    content = replaceYearTokens(content, pubDate);
    content = processSpintax(content);

    parts.push(content);
    modulesUsed.push(mod.id);

    // Increment usage so the next assembly prefers a different module.
    // NOTE(review): read-modify-write; concurrent runs can lose increments.
    await directus.request(updateItem('content_modules', mod.id, {
      usage_count: (mod.usage_count || 0) + 1
    }));
  }

  return { content: parts.join('\n\n'), modulesUsed };
}
/**
 * Resolve spintax groups of the form {option1|option2|option3}.
 *
 * Each pass replaces every innermost group (one containing no braces) with a
 * randomly chosen option, so nested groups resolve from the inside out.
 *
 * @param text - Text that may contain spintax groups.
 * @returns The text with every well-formed group resolved. Unbalanced braces
 *          and empty `{}` pairs are left untouched.
 */
function processSpintax(text: string): string {
  let result = text;
  let remainingPasses = 100; // caps the nesting depth that will be resolved

  while (remainingPasses > 0 && result.includes('{')) {
    const next = result.replace(/\{([^{}]+)\}/g, (_match, opts: string) => {
      const choices = opts.split('|');
      return choices[Math.floor(Math.random() * choices.length)];
    });

    // A match always strips its braces, so an unchanged string means no group
    // matched (e.g. a stray '{'); stop instead of spinning through the cap.
    if (next === result) break;

    result = next;
    remainingPasses--;
  }

  return result;
}
/**
 * Build a headline from a title template.
 *
 * Substitutes the {City}, {State}, {County} and {State_Code} tokens
 * (case-insensitively) from `context`, passes the result through
 * `replaceYearTokens` with `date`, and finally resolves spintax groups.
 *
 * @param template - Title template; a null or empty template yields ''.
 * @param context - Object read for `city`, `state`, `county` and `state_code`;
 *                  missing values are substituted as empty strings.
 * @param date - Date handed to `replaceYearTokens`.
 * @returns The resolved headline, or '' when there is no template.
 */
function generateHeadline(template: string | null, context: any, date: Date): string {
  if (!template) return '';

  // Replacer functions insert the value literally; a plain string replacement
  // would interpret `$&`, `$1`, ... inside the value as replacement patterns.
  const token = (key: string) => (): string => String(context?.[key] ?? '');

  // Any location token left unreplaced here would be consumed by the spintax
  // pass as a one-option group and collapse to its own name.
  const withLocation = template
    .replace(/\{City\}/gi, token('city'))
    .replace(/\{State\}/gi, token('state'))
    .replace(/\{County\}/gi, token('county'))
    .replace(/\{State_Code\}/gi, token('state_code'));

  return processSpintax(replaceYearTokens(withLocation, date));
}
/**
 * Map a US state name to its two-letter postal abbreviation.
 *
 * Names that are not one of the 50 states fall back to the first two
 * characters of the input, upper-cased.
 *
 * @param state - Full state name, e.g. "North Carolina".
 * @returns The postal code for a known state, the upper-cased two-character
 *          prefix for any other non-empty string, or '' for empty/non-string input.
 */
function getStateCode(state: string): string {
  if (typeof state !== 'string' || state === '') return '';

  // The table must cover every state: the prefix fallback is wrong for many of
  // them (e.g. Tennessee -> TE, Missouri -> MI, Alaska -> AL).
  const codes = new Map<string, string>([
    ['Alabama', 'AL'], ['Alaska', 'AK'], ['Arizona', 'AZ'], ['Arkansas', 'AR'],
    ['California', 'CA'], ['Colorado', 'CO'], ['Connecticut', 'CT'], ['Delaware', 'DE'],
    ['Florida', 'FL'], ['Georgia', 'GA'], ['Hawaii', 'HI'], ['Idaho', 'ID'],
    ['Illinois', 'IL'], ['Indiana', 'IN'], ['Iowa', 'IA'], ['Kansas', 'KS'],
    ['Kentucky', 'KY'], ['Louisiana', 'LA'], ['Maine', 'ME'], ['Maryland', 'MD'],
    ['Massachusetts', 'MA'], ['Michigan', 'MI'], ['Minnesota', 'MN'], ['Mississippi', 'MS'],
    ['Missouri', 'MO'], ['Montana', 'MT'], ['Nebraska', 'NE'], ['Nevada', 'NV'],
    ['New Hampshire', 'NH'], ['New Jersey', 'NJ'], ['New Mexico', 'NM'], ['New York', 'NY'],
    ['North Carolina', 'NC'], ['North Dakota', 'ND'], ['Ohio', 'OH'], ['Oklahoma', 'OK'],
    ['Oregon', 'OR'], ['Pennsylvania', 'PA'], ['Rhode Island', 'RI'], ['South Carolina', 'SC'],
    ['South Dakota', 'SD'], ['Tennessee', 'TN'], ['Texas', 'TX'], ['Utah', 'UT'],
    ['Vermont', 'VT'], ['Virginia', 'VA'], ['Washington', 'WA'], ['West Virginia', 'WV'],
    ['Wisconsin', 'WI'], ['Wyoming', 'WY']
  ]);

  return codes.get(state) ?? state.substring(0, 2).toUpperCase();
}

View File

@@ -0,0 +1,170 @@
// @ts-ignore - Astro types available at build time
import type { APIRoute } from 'astro';
import { getDirectusClient, readItems, createItem } from '@/lib/directus/client';
/**
* Scan Duplicates API
*
* Compares word shingles (N-grams) to detect duplicate sequences across articles.
* Flags any pair of articles sharing at least `threshold` (default 3) shingles
* of `ngram_size` (default 7) words each.
*
* POST /api/seo/scan-duplicates
*/
export const POST: APIRoute = async ({ request }: { request: Request }) => {
  // Contract: expects a JSON body with `queue_id` and/or `batch_ids`, plus optional
  // `ngram_size` (default 7) and `threshold` (default 3). Every article pair sharing
  // at least `threshold` shingles gets a `quality_flags` record.
  const json = (body: unknown, status: number): Response =>
    new Response(JSON.stringify(body), {
      status,
      headers: { 'Content-Type': 'application/json' }
    });

  try {
    const data = await request.json();
    const { queue_id, batch_ids, ngram_size = 7, threshold = 3 } = data;

    if (!queue_id && !batch_ids) {
      return json({ error: 'queue_id or batch_ids required' }, 400);
    }

    // Coerce and validate the tuning knobs. A string size would turn the
    // `i + n` slice bound into string concatenation, and a threshold below 1
    // would flag every pair (with a NaN similarity when both articles are empty).
    const ngramSize = Number(ngram_size);
    const minShared = Number(threshold);
    if (!Number.isInteger(ngramSize) || ngramSize < 1 || !(minShared >= 1)) {
      return json(
        { error: 'ngram_size must be a positive integer and threshold must be at least 1' },
        400
      );
    }

    const directus = getDirectusClient();

    // Get articles to scan
    let articles: any[];
    if (batch_ids && Array.isArray(batch_ids)) {
      articles = await directus.request(readItems('generated_articles', {
        filter: { id: { _in: batch_ids } },
        fields: ['id', 'site', 'headline', 'full_html_body']
      })) as any[];
    } else {
      // No explicit ids: scan the 20 most recent test-batch articles.
      // NOTE(review): this query is not filtered by `queue_id` — confirm intended.
      articles = await directus.request(readItems('generated_articles', {
        filter: { is_test_batch: { _eq: true } },
        sort: ['-date_created'],
        limit: 20,
        fields: ['id', 'site', 'headline', 'full_html_body']
      })) as any[];
    }

    if (articles.length < 2) {
      return json({
        success: true,
        message: 'Need at least 2 articles to compare',
        flags_created: 0
      }, 200);
    }

    // Build shingle sets for each article
    const articleShingles: Map<string, Set<string>> = new Map();
    for (const article of articles) {
      const text = stripHtml(article.full_html_body || '');
      articleShingles.set(article.id, generateShingles(text, ngramSize));
    }

    // Compare all pairs
    const collisions: Array<{
      articleA: string;
      articleB: string;
      sharedShingles: string[];
      similarity: number;
    }> = [];

    const entries = Array.from(articleShingles.entries());
    for (let i = 0; i < entries.length; i++) {
      const [idA, setA] = entries[i];
      for (let j = i + 1; j < entries.length; j++) {
        const [idB, setB] = entries[j];

        // Find intersection
        const shared = [...setA].filter(s => setB.has(s));
        if (shared.length < minShared) continue;

        // Jaccard similarity as a percentage; |A ∪ B| = |A| + |B| - |A ∩ B|,
        // so the union never has to be materialised.
        const unionSize = setA.size + setB.size - shared.length;
        const similarity = (shared.length / unionSize) * 100;

        collisions.push({
          articleA: idA,
          articleB: idB,
          sharedShingles: shared.slice(0, 5), // Just first 5 examples
          similarity
        });
      }
    }

    // Create quality flags for collisions; all flags are attributed to the
    // first scanned article's site.
    const siteId = articles[0]?.site;
    let flagsCreated = 0;
    for (const collision of collisions) {
      await directus.request(
        createItem('quality_flags', {
          site: siteId,
          batch_id: queue_id || null,
          article_a: collision.articleA,
          article_b: collision.articleB,
          collision_text: collision.sharedShingles.join(' | '),
          similarity_score: collision.similarity,
          status: 'pending'
        })
      );
      flagsCreated++;
    }

    return json({
      success: true,
      articles_scanned: articles.length,
      collisions_found: collisions.length,
      flags_created: flagsCreated,
      details: collisions.map(c => ({
        article_a: c.articleA,
        article_b: c.articleB,
        similarity: c.similarity.toFixed(1) + '%',
        examples: c.sharedShingles
      }))
    }, 200);
  } catch (error) {
    console.error('Error scanning duplicates:', error);
    return json({ error: 'Failed to scan duplicates' }, 500);
  }
};
/**
 * Strip HTML tags and normalize text for comparison.
 *
 * Tags become spaces, the `&nbsp;`, `&lt;`, `&gt;` and `&amp;` entities are
 * decoded, whitespace runs collapse to one space, and the result is trimmed
 * and lower-cased.
 *
 * @param html - HTML fragment.
 * @returns Lower-cased plain text.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]*>/g, ' ')
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    // `&amp;` must be decoded last: decoding it first would turn an escaped
    // entity such as `&amp;lt;` into `&lt;` and then decode it a second time.
    .replace(/&amp;/g, '&')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase();
}
/**
 * Generate the set of N-gram shingles (runs of `n` consecutive words) in a text.
 *
 * Words of one or two characters are dropped before shingling, so shingles
 * span only the remaining words.
 *
 * @param text - Whitespace-separated text.
 * @param n - Words per shingle; fractional values are floored.
 * @returns The distinct shingles, in order of first appearance. Empty when
 *          `n` is below 1 (or not a number) or the text has fewer than `n`
 *          qualifying words.
 */
function generateShingles(text: string, n: number): Set<string> {
  const shingles = new Set<string>();

  // A size below 1 would otherwise emit the empty string as a shingle, which
  // every article would then appear to share.
  const size = Math.floor(n);
  if (!(size >= 1)) return shingles;

  const words = text.split(/\s+/).filter(w => w.length > 2);
  for (let start = 0; start + size <= words.length; start++) {
    shingles.add(words.slice(start, start + size).join(' '));
  }

  return shingles;
}