feat: Ultimate Edition - Modular assembly, duplicate scanner, content modules, quality flags
This commit is contained in:
224
frontend/src/pages/api/seo/assemble-article.ts
Normal file
224
frontend/src/pages/api/seo/assemble-article.ts
Normal file
@@ -0,0 +1,224 @@
|
||||
// @ts-ignore - Astro types available at build time
|
||||
import type { APIRoute } from 'astro';
|
||||
import { getDirectusClient, readItems, createItem, updateItem } from '@/lib/directus/client';
|
||||
import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
|
||||
|
||||
/**
 * Assemble Article API
 *
 * POST /api/seo/assemble-article
 *
 * Builds one draft article for a campaign and a location. For every module
 * type in the campaign's `content_recipe`, the least-used active
 * `content_modules` row of the campaign's site is selected, its location/year
 * tokens and spintax are resolved, and the parts are joined into the article
 * body. The result is stored in `generated_articles` as an unpublished item.
 *
 * Request body (JSON):
 *   - campaign_id   (required) id of a `campaign_masters` item
 *   - location      (required) object with optional `city`, `state`, `county`
 *   - publish_date  (optional) any value `new Date()` accepts; defaults to now
 *   - modified_date (optional) any value `new Date()` accepts; defaults to now
 *
 * Responses: 200 with a summary of the created article, 400 for an invalid
 * body, 404 when the campaign does not exist, 500 for unexpected failures.
 */
export const POST: APIRoute = async ({ request }: { request: Request }) => {
    const respond = (body: unknown, status: number): Response =>
        new Response(JSON.stringify(body), {
            status,
            headers: { 'Content-Type': 'application/json' }
        });

    try {
        // A malformed body is a client error, not a server failure.
        let data: any;
        try {
            data = await request.json();
        } catch {
            return respond({ error: 'Request body must be valid JSON' }, 400);
        }

        const { campaign_id, location, publish_date, modified_date } = data || {};

        if (!campaign_id || !location || typeof location !== 'object') {
            return respond({ error: 'campaign_id and location required' }, 400);
        }

        // Validate dates up front: toISOString() throws on an Invalid Date,
        // which would otherwise surface as an opaque 500 after modules were
        // already consumed.
        const pubDate = publish_date ? new Date(publish_date) : new Date();
        const modDate = modified_date ? new Date(modified_date) : new Date();
        if (Number.isNaN(pubDate.getTime()) || Number.isNaN(modDate.getTime())) {
            return respond({ error: 'publish_date and modified_date must be valid dates' }, 400);
        }

        const directus = getDirectusClient();

        // Get campaign with recipe
        const campaigns = await directus.request(readItems('campaign_masters', {
            filter: { id: { _eq: campaign_id } },
            limit: 1
        })) as any[];

        const campaign = campaigns[0];
        if (!campaign) {
            return respond({ error: 'Campaign not found' }, 404);
        }

        // Only an array is a usable recipe; anything else falls back to the default.
        const recipe: string[] = Array.isArray(campaign.content_recipe)
            ? campaign.content_recipe
            : ['intro', 'benefits', 'howto', 'conclusion'];

        // Build context for token replacement
        const context = {
            city: location.city || '',
            state: location.state || '',
            county: location.county || '',
            state_code: getStateCode(location.state) || '',
            year: pubDate.getFullYear()
        };

        // Fetch and assemble modules
        const assembledParts: string[] = [];
        const modulesUsed: string[] = [];

        for (const moduleType of recipe) {
            // Lowest usage_count first so modules rotate; id breaks ties deterministically.
            const modules = await directus.request(readItems('content_modules', {
                filter: {
                    site: { _eq: campaign.site },
                    module_type: { _eq: moduleType },
                    is_active: { _eq: true }
                },
                sort: ['usage_count', 'id'],
                limit: 1
            })) as any[];

            if (modules.length === 0) continue;

            const mod = modules[0];
            let content: string = mod.content_spintax || '';

            // Replace location tokens. Function replacers are used so that a
            // `$` inside a location name is inserted literally instead of
            // being interpreted as a replacement pattern ($&, $1, ...).
            content = content
                .replace(/\{City\}/gi, () => context.city)
                .replace(/\{State\}/gi, () => context.state)
                .replace(/\{County\}/gi, () => context.county)
                .replace(/\{State_Code\}/gi, () => context.state_code)
                .replace(/\{Location_City\}/gi, () => context.city)
                .replace(/\{Location_State\}/gi, () => context.state);

            // Year tokens must be resolved before spintax, which consumes any
            // remaining {...} group.
            content = replaceYearTokens(content, pubDate);
            content = processSpintax(content);

            assembledParts.push(content);
            modulesUsed.push(mod.id);

            // Increment usage count.
            // NOTE(review): read-modify-write; concurrent requests can lose
            // an increment. Acceptable for rotation, not for exact counts.
            await directus.request(
                updateItem('content_modules', mod.id, {
                    usage_count: (mod.usage_count || 0) + 1
                })
            );
        }

        const fullContent = assembledParts.join('\n\n');

        // Headline comes from the campaign's spintax title template, with a
        // "<city> <campaign name>" fallback when no template is configured.
        const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
            `${context.city} ${campaign.name || 'Guide'}`;

        // Meta fields are derived from the tag-free text.
        const plainText = stripHtml(fullContent);
        const metaTitle = headline.substring(0, 60);
        // Only mark the description as truncated when it actually is.
        const metaDescription = plainText.length > 155
            ? plainText.substring(0, 155) + '...'
            : plainText;

        // ''.split(/\s+/) yields [''], so empty content must be special-cased.
        const wordCount = plainText ? plainText.split(/\s+/).length : 0;

        // Create article
        const article = await directus.request(
            createItem('generated_articles', {
                site: campaign.site,
                campaign: campaign_id,
                headline: headline,
                meta_title: metaTitle,
                meta_description: metaDescription,
                full_html_body: fullContent,
                word_count: wordCount,
                is_published: false,
                is_test_batch: false,
                date_published: pubDate.toISOString(),
                date_modified: modDate.toISOString(),
                sitemap_status: 'ghost',
                location_city: context.city,
                location_county: context.county,
                location_state: context.state,
                modules_used: modulesUsed
            })
        ) as any;

        return respond({
            success: true,
            article_id: article.id,
            headline,
            word_count: wordCount,
            modules_used: modulesUsed.length,
            dates: {
                published: pubDate.toISOString(),
                modified: modDate.toISOString()
            }
        }, 200);
    } catch (error) {
        console.error('Error assembling article:', error);
        return respond({ error: 'Failed to assemble article' }, 500);
    }
};
|
||||
|
||||
/**
 * Resolve spintax: each `{option1|option2|option3}` group is replaced by one
 * randomly chosen option.
 *
 * Nested groups are resolved innermost first, one nesting level per pass.
 * Processing stops as soon as a pass changes nothing, so text containing an
 * unmatched `{` or an empty `{}` is returned after a single scan instead of
 * being rescanned until the pass budget is exhausted.
 *
 * @param text Text that may contain spintax groups.
 * @returns The text with every resolvable group replaced.
 */
function processSpintax(text: string): string {
    // Matches only groups that contain no further braces, i.e. the innermost ones.
    const innermostGroup = /\{([^{}]+)\}/g;
    // Upper bound on nesting depth; guards against pathological input.
    const maxPasses = 100;

    let result = text;
    for (let pass = 0; pass < maxPasses; pass++) {
        const next = result.replace(innermostGroup, (_match: string, options: string) => {
            const choices = options.split('|');
            return choices[Math.floor(Math.random() * choices.length)];
        });
        // A resolved group always loses its braces, so an unchanged string
        // means nothing matched and no later pass can match either.
        if (next === result) break;
        result = next;
    }

    return result;
}
|
||||
|
||||
/**
 * Generate a headline from a spintax title template.
 *
 * Location tokens are substituted first, then year tokens, then spintax
 * groups. The same location tokens as in article bodies are supported:
 * `{City}`, `{State}`, `{County}`, `{State_Code}`, `{Location_City}` and
 * `{Location_State}` (case-insensitive). Any token left unresolved would be
 * treated by the spintax pass as a one-option group and end up in the
 * headline as its literal name.
 *
 * @param template Title template, or null/empty when none is configured.
 * @param context  Object providing `city`, `state`, `county` and `state_code`.
 * @param date     Date passed to `replaceYearTokens`.
 * @returns The generated headline, or '' when there is no template.
 */
function generateHeadline(template: string | null, context: any, date: Date): string {
    if (!template) return '';

    // Function replacers insert the value literally; a plain string would
    // have `$&`, `$1`, ... expanded by String.prototype.replace.
    let headline = template
        .replace(/\{City\}/gi, () => context.city)
        .replace(/\{State\}/gi, () => context.state)
        .replace(/\{County\}/gi, () => context.county)
        .replace(/\{State_Code\}/gi, () => context.state_code ?? '')
        .replace(/\{Location_City\}/gi, () => context.city)
        .replace(/\{Location_State\}/gi, () => context.state);

    headline = replaceYearTokens(headline, date);
    headline = processSpintax(headline);

    return headline;
}
|
||||
|
||||
/**
 * Convert an HTML fragment to plain text.
 *
 * Tags are replaced by a space, the entities `&nbsp;`, `&lt;`, `&gt;` and
 * `&amp;` are decoded, and runs of whitespace collapse to a single space.
 * Without the entity step, `&nbsp;` would glue words together in word counts
 * and show up verbatim in meta descriptions.
 *
 * @param html HTML fragment.
 * @returns Trimmed plain text.
 */
function stripHtml(html: string): string {
    return html
        .replace(/<[^>]*>/g, ' ')
        .replace(/&nbsp;/g, ' ')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        // Decoded last so "&amp;lt;" becomes "&lt;" and is not unescaped twice.
        .replace(/&amp;/g, '&')
        .replace(/\s+/g, ' ')
        .trim();
}
|
||||
|
||||
/** Lower-cased US state name -> two-letter postal abbreviation. */
const US_STATE_CODES: ReadonlyMap<string, string> = new Map<string, string>([
    ['alabama', 'AL'], ['alaska', 'AK'], ['arizona', 'AZ'], ['arkansas', 'AR'],
    ['california', 'CA'], ['colorado', 'CO'], ['connecticut', 'CT'], ['delaware', 'DE'],
    ['district of columbia', 'DC'],
    ['florida', 'FL'], ['georgia', 'GA'], ['hawaii', 'HI'], ['idaho', 'ID'],
    ['illinois', 'IL'], ['indiana', 'IN'], ['iowa', 'IA'], ['kansas', 'KS'],
    ['kentucky', 'KY'], ['louisiana', 'LA'], ['maine', 'ME'], ['maryland', 'MD'],
    ['massachusetts', 'MA'], ['michigan', 'MI'], ['minnesota', 'MN'], ['mississippi', 'MS'],
    ['missouri', 'MO'], ['montana', 'MT'], ['nebraska', 'NE'], ['nevada', 'NV'],
    ['new hampshire', 'NH'], ['new jersey', 'NJ'], ['new mexico', 'NM'], ['new york', 'NY'],
    ['north carolina', 'NC'], ['north dakota', 'ND'], ['ohio', 'OH'], ['oklahoma', 'OK'],
    ['oregon', 'OR'], ['pennsylvania', 'PA'], ['rhode island', 'RI'], ['south carolina', 'SC'],
    ['south dakota', 'SD'], ['tennessee', 'TN'], ['texas', 'TX'], ['utah', 'UT'],
    ['vermont', 'VT'], ['virginia', 'VA'], ['washington', 'WA'], ['west virginia', 'WV'],
    ['wisconsin', 'WI'], ['wyoming', 'WY']
]);

/**
 * Look up the two-letter postal code for a US state.
 *
 * Accepts a full state name in any letter case with arbitrary surrounding or
 * repeated whitespace, or an already-abbreviated known code. A Map is used
 * instead of a plain object so that names such as "constructor" cannot
 * resolve to inherited object members.
 *
 * @param state State name or code; non-string values are tolerated.
 * @returns The upper-case code, or '' when the state is not recognised.
 */
function getStateCode(state: string): string {
    if (typeof state !== 'string') return '';

    const normalized = state.trim().replace(/\s+/g, ' ').toLowerCase();
    const byName = US_STATE_CODES.get(normalized);
    if (byName) return byName;

    // Pass through input that is already a known abbreviation (e.g. "fl").
    const upper = normalized.toUpperCase();
    for (const code of US_STATE_CODES.values()) {
        if (code === upper) return code;
    }
    return '';
}
|
||||
290
frontend/src/pages/api/seo/process-queue.ts
Normal file
290
frontend/src/pages/api/seo/process-queue.ts
Normal file
@@ -0,0 +1,290 @@
|
||||
// @ts-ignore - Astro types available at build time
|
||||
import type { APIRoute } from 'astro';
|
||||
import { getDirectusClient, readItem, readItems, updateItem, createItem } from '@/lib/directus/client';
|
||||
import { replaceYearTokens } from '@/lib/seo/velocity-scheduler';
|
||||
|
||||
/**
 * Process Queue API
 *
 * POST /api/seo/process-queue
 *
 * Runs the factory: generates the next batch of scheduled articles for an
 * approved (or already running) production queue. Intended to be called
 * repeatedly, by cron or manually, until the response reports `status: 'done'`.
 *
 * Request body (JSON):
 *   - queue_id    (required) id of a `production_queue` item
 *   - batch_limit (optional) positive integer, maximum articles per call
 *                 (default 100)
 *
 * Progress model: `completed_count` is the number of leading `schedule_data`
 * entries that already have an article. A batch therefore stops at the first
 * entry that fails; the error is written to `error_log` and the next call
 * retries that entry. Continuing past a failure would shift the counter away
 * from the schedule index, duplicating later entries and never retrying the
 * failed one.
 *
 * Responses: 200 with batch/progress details, 400 for an invalid body or a
 * queue that is not approved, 404 for an unknown queue or campaign, 500 for
 * unexpected failures.
 */
export const POST: APIRoute = async ({ request }: { request: Request }) => {
    const respond = (body: unknown, status: number): Response =>
        new Response(JSON.stringify(body), {
            status,
            headers: { 'Content-Type': 'application/json' }
        });

    try {
        // A malformed body is a client error, not a server failure.
        let data: any;
        try {
            data = await request.json();
        } catch {
            return respond({ error: 'Request body must be valid JSON' }, 400);
        }

        const { queue_id, batch_limit = 100 } = data || {};

        if (!queue_id) {
            return respond({ error: 'queue_id is required' }, 400);
        }

        // Coerce before doing arithmetic: a string such as "50" would
        // otherwise be concatenated onto the start index.
        const batchLimit = Number(batch_limit);
        if (!Number.isInteger(batchLimit) || batchLimit < 1) {
            return respond({ error: 'batch_limit must be a positive integer' }, 400);
        }

        const directus = getDirectusClient();

        // Get queue
        const queue = await directus.request(readItem('production_queue', queue_id)) as any;
        if (!queue) {
            return respond({ error: 'Queue not found' }, 404);
        }

        if (queue.status !== 'approved' && queue.status !== 'running') {
            return respond({ error: 'Queue must be approved to process' }, 400);
        }

        // Get campaign before touching queue state, so a dangling reference
        // does not leave the queue marked as running.
        const campaign = await directus.request(
            readItem('campaign_masters', queue.campaign)
        ) as any;
        if (!campaign) {
            return respond({ error: 'Campaign not found' }, 404);
        }

        // Mark as running; keep the original start time when resuming.
        await directus.request(
            updateItem('production_queue', queue_id, {
                status: 'running',
                started_at: queue.started_at || new Date().toISOString()
            })
        );

        // Slice the part of the schedule this call is responsible for.
        const scheduleData: any[] = Array.isArray(queue.schedule_data) ? queue.schedule_data : [];
        const startIndex: number = queue.completed_count || 0;
        const endIndex = Math.min(startIndex + batchLimit, scheduleData.length);
        const batchSchedule = scheduleData.slice(startIndex, endIndex);

        if (batchSchedule.length === 0) {
            // All done!
            await directus.request(
                updateItem('production_queue', queue_id, {
                    status: 'done',
                    completed_at: new Date().toISOString()
                })
            );

            return respond({
                success: true,
                message: 'Queue complete',
                total_generated: queue.completed_count
            }, 200);
        }

        // Get locations based on filter.
        // NOTE(review): no sort is specified, so offset paging relies on the
        // backend returning rows in a stable order between calls; confirm.
        const locationFilter = campaign.target_locations_filter || {};
        const locations = await directus.request(readItems('locations_cities', {
            filter: locationFilter,
            limit: batchSchedule.length,
            offset: startIndex
        })) as any[];

        // Get recipe
        const recipe = campaign.content_recipe || ['intro', 'benefits', 'howto', 'conclusion'];

        let generated = 0;
        const errors: string[] = [];

        if (locations.length === 0) {
            // Without this the loop would skip every entry silently and the
            // queue would stay "running" forever with no explanation.
            errors.push('No locations matched the campaign location filter');
        } else {
            for (let i = 0; i < batchSchedule.length; i++) {
                const schedule = batchSchedule[i];
                // Reuse locations from this page when it is shorter than the batch.
                const location = locations[i] || locations[i % locations.length];

                try {
                    const pubDate = new Date(schedule.publish_date);
                    const modDate = new Date(schedule.modified_date);

                    const context = {
                        city: location.city || location.name || '',
                        state: location.state || '',
                        county: location.county || '',
                        state_code: getStateCode(location.state) || ''
                    };

                    // Assemble content from modules
                    const { content, modulesUsed } = await assembleFromModules(
                        directus, campaign.site, recipe, context, pubDate
                    );

                    // Generate headline
                    const headline = generateHeadline(campaign.spintax_title, context, pubDate) ||
                        `${context.city} ${campaign.name || 'Guide'}`;

                    // Normalise whitespace before counting: splitting the raw
                    // tag-stripped text yields empty leading/trailing tokens
                    // that inflate the word count.
                    const plainText = content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
                    const wordCount = plainText ? plainText.split(' ').length : 0;
                    const metaDescription = plainText.length > 155
                        ? plainText.substring(0, 155) + '...'
                        : plainText;

                    // Create article
                    await directus.request(
                        createItem('generated_articles', {
                            site: queue.site,
                            campaign: campaign.id,
                            headline: headline,
                            meta_title: headline.substring(0, 60),
                            meta_description: metaDescription,
                            full_html_body: content,
                            word_count: wordCount,
                            is_published: true, // Ghost published
                            is_test_batch: false,
                            date_published: pubDate.toISOString(),
                            date_modified: modDate.toISOString(),
                            sitemap_status: 'ghost',
                            location_city: context.city,
                            location_county: context.county,
                            location_state: context.state,
                            modules_used: modulesUsed
                        })
                    );

                    generated++;
                } catch (err: unknown) {
                    // Thrown values are not guaranteed to be Error instances.
                    const reason = err instanceof Error ? err.message : JSON.stringify(err);
                    errors.push(`Article ${i}: ${reason}`);
                    // Stop here so completed_count stays equal to the index of
                    // the next unprocessed schedule entry.
                    break;
                }
            }
        }

        // Update queue progress
        const newCompleted = startIndex + generated;
        const isComplete = newCompleted >= scheduleData.length;

        await directus.request(
            updateItem('production_queue', queue_id, {
                completed_count: newCompleted,
                status: isComplete ? 'done' : 'running',
                completed_at: isComplete ? new Date().toISOString() : null,
                error_log: errors.length > 0 ? errors.join('\n') : null
            })
        );

        // Update site factory status
        await directus.request(
            updateItem('sites', queue.site, {
                factory_status: isComplete ? 'publishing' : 'generating'
            })
        );

        // Log work
        await directus.request(
            createItem('work_log', {
                site: queue.site,
                action: 'batch_generated',
                entity_type: 'production_queue',
                entity_id: queue_id,
                details: {
                    generated,
                    errors: errors.length,
                    progress: `${newCompleted}/${scheduleData.length}`
                }
            })
        );

        return respond({
            success: true,
            generated,
            errors: errors.length,
            progress: {
                completed: newCompleted,
                total: scheduleData.length,
                percent: Math.round((newCompleted / scheduleData.length) * 100)
            },
            status: isComplete ? 'done' : 'running',
            next_step: isComplete
                ? 'Queue complete! Run sitemap-drip cron to start indexing.'
                : 'Call process-queue again to continue.'
        }, 200);
    } catch (error) {
        console.error('Error processing queue:', error);
        return respond({ error: 'Failed to process queue' }, 500);
    }
};
|
||||
|
||||
/**
 * Build an article body from content modules.
 *
 * For each module type in `recipe`, the least-used active module of the site
 * is fetched, its location tokens, year tokens and spintax are resolved (in
 * that order), and its `usage_count` is incremented. Module types without an
 * active module are skipped.
 *
 * @param directus Directus client used for all requests.
 * @param siteId   Site whose modules may be used.
 * @param recipe   Ordered list of module types.
 * @param context  Object providing `city`, `state`, `county` and `state_code`.
 * @param pubDate  Date passed to `replaceYearTokens`.
 * @returns The parts joined by blank lines, and the ids of the modules used.
 */
async function assembleFromModules(
    directus: any,
    siteId: string,
    recipe: string[],
    context: any,
    pubDate: Date
): Promise<{ content: string; modulesUsed: string[] }> {
    const parts: string[] = [];
    const modulesUsed: string[] = [];

    for (const moduleType of recipe) {
        const modules = await directus.request(readItems('content_modules', {
            filter: {
                site: { _eq: siteId },
                module_type: { _eq: moduleType },
                is_active: { _eq: true }
            },
            // Lowest usage first; id makes the choice deterministic when
            // several modules share the same usage_count.
            sort: ['usage_count', 'id'],
            limit: 1
        })) as any[];

        if (modules.length === 0) continue;

        const mod = modules[0];
        let content: string = mod.content_spintax || '';

        // Replace tokens. Function replacers insert values literally; a plain
        // string would have `$&`, `$1`, ... expanded by String.prototype.replace.
        content = content
            .replace(/\{City\}/gi, () => context.city)
            .replace(/\{State\}/gi, () => context.state)
            .replace(/\{County\}/gi, () => context.county)
            .replace(/\{State_Code\}/gi, () => context.state_code)
            .replace(/\{Location_City\}/gi, () => context.city)
            .replace(/\{Location_State\}/gi, () => context.state);

        // Year tokens must be resolved before spintax, which consumes any
        // remaining {...} group.
        content = replaceYearTokens(content, pubDate);
        content = processSpintax(content);

        parts.push(content);
        modulesUsed.push(mod.id);

        // Increment usage.
        // NOTE(review): read-modify-write; concurrent runs can lose an increment.
        await directus.request(updateItem('content_modules', mod.id, {
            usage_count: (mod.usage_count || 0) + 1
        }));
    }

    return { content: parts.join('\n\n'), modulesUsed };
}
|
||||
|
||||
/**
 * Resolve spintax: each `{option1|option2|option3}` group is replaced by one
 * randomly chosen option.
 *
 * Nested groups are resolved innermost first, one nesting level per pass.
 * Processing stops as soon as a pass changes nothing, so text containing an
 * unmatched `{` or an empty `{}` is scanned once rather than on every one of
 * the allowed passes.
 *
 * @param text Text that may contain spintax groups.
 * @returns The text with every resolvable group replaced.
 */
function processSpintax(text: string): string {
    // Matches only groups that contain no further braces, i.e. the innermost ones.
    const innermostGroup = /\{([^{}]+)\}/g;
    // Upper bound on nesting depth; guards against pathological input.
    const maxPasses = 100;

    let result = text;
    for (let pass = 0; pass < maxPasses; pass++) {
        const next = result.replace(innermostGroup, (_match: string, opts: string) => {
            const choices = opts.split('|');
            return choices[Math.floor(Math.random() * choices.length)];
        });
        // A resolved group always loses its braces, so an unchanged string
        // means nothing matched and no later pass can match either.
        if (next === result) break;
        result = next;
    }
    return result;
}
|
||||
|
||||
/**
 * Generate a headline from a spintax title template.
 *
 * Location tokens are substituted first, then year tokens, then spintax
 * groups. Supported location tokens (case-insensitive): `{City}`, `{State}`,
 * `{County}`, `{State_Code}`, `{Location_City}` and `{Location_State}`. A
 * token left unresolved would be treated by the spintax pass as a one-option
 * group and appear in the headline as its literal name.
 *
 * @param template Title template, or null/empty when none is configured.
 * @param context  Object providing `city`, `state`, `county` and `state_code`.
 * @param date     Date passed to `replaceYearTokens`.
 * @returns The generated headline, or '' when there is no template.
 */
function generateHeadline(template: string | null, context: any, date: Date): string {
    if (!template) return '';

    // Function replacers insert the value literally; a plain string would
    // have `$&`, `$1`, ... expanded by String.prototype.replace.
    let h = template
        .replace(/\{City\}/gi, () => context.city)
        .replace(/\{State\}/gi, () => context.state)
        .replace(/\{County\}/gi, () => context.county ?? '')
        .replace(/\{State_Code\}/gi, () => context.state_code ?? '')
        .replace(/\{Location_City\}/gi, () => context.city)
        .replace(/\{Location_State\}/gi, () => context.state);

    h = replaceYearTokens(h, date);
    return processSpintax(h);
}
|
||||
|
||||
/** Lower-cased US state name -> two-letter postal abbreviation. */
const US_STATE_CODES: ReadonlyMap<string, string> = new Map<string, string>([
    ['alabama', 'AL'], ['alaska', 'AK'], ['arizona', 'AZ'], ['arkansas', 'AR'],
    ['california', 'CA'], ['colorado', 'CO'], ['connecticut', 'CT'], ['delaware', 'DE'],
    ['district of columbia', 'DC'],
    ['florida', 'FL'], ['georgia', 'GA'], ['hawaii', 'HI'], ['idaho', 'ID'],
    ['illinois', 'IL'], ['indiana', 'IN'], ['iowa', 'IA'], ['kansas', 'KS'],
    ['kentucky', 'KY'], ['louisiana', 'LA'], ['maine', 'ME'], ['maryland', 'MD'],
    ['massachusetts', 'MA'], ['michigan', 'MI'], ['minnesota', 'MN'], ['mississippi', 'MS'],
    ['missouri', 'MO'], ['montana', 'MT'], ['nebraska', 'NE'], ['nevada', 'NV'],
    ['new hampshire', 'NH'], ['new jersey', 'NJ'], ['new mexico', 'NM'], ['new york', 'NY'],
    ['north carolina', 'NC'], ['north dakota', 'ND'], ['ohio', 'OH'], ['oklahoma', 'OK'],
    ['oregon', 'OR'], ['pennsylvania', 'PA'], ['rhode island', 'RI'], ['south carolina', 'SC'],
    ['south dakota', 'SD'], ['tennessee', 'TN'], ['texas', 'TX'], ['utah', 'UT'],
    ['vermont', 'VT'], ['virginia', 'VA'], ['washington', 'WA'], ['west virginia', 'WV'],
    ['wisconsin', 'WI'], ['wyoming', 'WY']
]);

/**
 * Look up the two-letter code for a state.
 *
 * All US states resolve through the table (case-insensitive, whitespace
 * tolerant). The table matters because the first two letters of a state name
 * are usually NOT its postal code (Alaska is AK, Tennessee is TN, Maine is ME).
 * Names that are not in the table keep the legacy behaviour of returning their
 * first two characters upper-cased, which also passes through input that is
 * already a two-letter code.
 *
 * @param state State name or code; non-string values are tolerated.
 * @returns An upper-case code, or '' for empty or non-string input.
 */
function getStateCode(state: string): string {
    if (typeof state !== 'string') return '';

    const trimmed = state.trim();
    const known = US_STATE_CODES.get(trimmed.replace(/\s+/g, ' ').toLowerCase());
    return known || trimmed.substring(0, 2).toUpperCase();
}
|
||||
170
frontend/src/pages/api/seo/scan-duplicates.ts
Normal file
170
frontend/src/pages/api/seo/scan-duplicates.ts
Normal file
@@ -0,0 +1,170 @@
|
||||
// @ts-ignore - Astro types available at build time
|
||||
import type { APIRoute } from 'astro';
|
||||
import { getDirectusClient, readItems, createItem } from '@/lib/directus/client';
|
||||
|
||||
/**
 * Scan Duplicates API
 *
 * POST /api/seo/scan-duplicates
 *
 * Uses word shingles (N-grams) to detect text shared between articles. Every
 * pair of articles that shares at least `threshold` distinct shingles gets a
 * `quality_flags` item with example shingles and the Jaccard similarity of
 * the two shingle sets, expressed as a percentage.
 *
 * Request body (JSON):
 *   - batch_ids  array of `generated_articles` ids to compare, or
 *   - queue_id   stored on the created flags as `batch_id`; when no
 *                `batch_ids` are given the 20 most recent test-batch
 *                articles are scanned
 *   - ngram_size (optional) words per shingle, positive integer (default 7)
 *   - threshold  (optional) minimum shared shingles, positive integer (default 3)
 *
 * NOTE(review): `queue_id` does not restrict which test-batch articles are
 * fetched, and repeated scans create repeated flags; confirm both are intended.
 *
 * Responses: 200 with scan results, 400 for an invalid body, 500 for
 * unexpected failures.
 */
export const POST: APIRoute = async ({ request }: { request: Request }) => {
    const respond = (body: unknown, status: number): Response =>
        new Response(JSON.stringify(body), {
            status,
            headers: { 'Content-Type': 'application/json' }
        });

    try {
        // A malformed body is a client error, not a server failure.
        let data: any;
        try {
            data = await request.json();
        } catch {
            return respond({ error: 'Request body must be valid JSON' }, 400);
        }

        const { queue_id, batch_ids, ngram_size = 7, threshold = 3 } = data || {};

        if (!queue_id && !batch_ids) {
            return respond({ error: 'queue_id or batch_ids required' }, 400);
        }

        // A non-array batch_ids used to fall through silently to the
        // "latest test batch" query, scanning articles the caller never named.
        if (batch_ids && !Array.isArray(batch_ids)) {
            return respond({ error: 'batch_ids must be an array' }, 400);
        }

        const ngramSize = Number(ngram_size);
        const minShared = Number(threshold);
        if (!Number.isInteger(ngramSize) || ngramSize < 1 ||
            !Number.isInteger(minShared) || minShared < 1) {
            return respond({ error: 'ngram_size and threshold must be positive integers' }, 400);
        }

        const directus = getDirectusClient();

        // Get articles to scan
        let articles: any[];
        if (Array.isArray(batch_ids)) {
            articles = batch_ids.length === 0 ? [] : await directus.request(readItems('generated_articles', {
                filter: { id: { _in: batch_ids } },
                fields: ['id', 'site', 'headline', 'full_html_body'],
                // Without an explicit limit the backend's default page size
                // applies and larger batches are silently truncated.
                limit: batch_ids.length
            })) as any[];
        } else {
            // Most recent test-batch articles
            articles = await directus.request(readItems('generated_articles', {
                filter: { is_test_batch: { _eq: true } },
                sort: ['-date_created'],
                limit: 20,
                fields: ['id', 'site', 'headline', 'full_html_body']
            })) as any[];
        }

        if (articles.length < 2) {
            return respond({
                success: true,
                message: 'Need at least 2 articles to compare',
                flags_created: 0
            }, 200);
        }

        // Build shingle sets for each article, remembering each article's site.
        const articleShingles: Map<string, Set<string>> = new Map();
        const articleSites: Map<string, unknown> = new Map();

        for (const article of articles) {
            const text = stripHtml(article.full_html_body || '');
            articleShingles.set(article.id, generateShingles(text, ngramSize));
            articleSites.set(article.id, article.site);
        }

        // Compare all pairs
        const collisions: Array<{
            articleA: string;
            articleB: string;
            sharedShingles: string[];
            similarity: number;
        }> = [];

        const articleIds = Array.from(articleShingles.keys());

        for (let i = 0; i < articleIds.length; i++) {
            for (let j = i + 1; j < articleIds.length; j++) {
                const idA = articleIds[i];
                const idB = articleIds[j];
                const setA = articleShingles.get(idA)!;
                const setB = articleShingles.get(idB)!;

                // Find intersection
                const shared = [...setA].filter(s => setB.has(s));
                if (shared.length < minShared) continue;

                // Jaccard similarity: |A ∩ B| / |A ∪ B|, as a percentage
                const union = new Set([...setA, ...setB]);
                const similarity = (shared.length / union.size) * 100;

                collisions.push({
                    articleA: idA,
                    articleB: idB,
                    sharedShingles: shared.slice(0, 5), // Just first 5 examples
                    similarity
                });
            }
        }

        // Create quality flags for collisions. The flag is attributed to
        // article A's own site rather than to the first fetched article's
        // site, which is wrong whenever the scan spans several sites.
        let flagsCreated = 0;

        for (const collision of collisions) {
            await directus.request(
                createItem('quality_flags', {
                    site: articleSites.get(collision.articleA),
                    batch_id: queue_id || null,
                    article_a: collision.articleA,
                    article_b: collision.articleB,
                    collision_text: collision.sharedShingles.join(' | '),
                    similarity_score: collision.similarity,
                    status: 'pending'
                })
            );
            flagsCreated++;
        }

        return respond({
            success: true,
            articles_scanned: articles.length,
            collisions_found: collisions.length,
            flags_created: flagsCreated,
            details: collisions.map(c => ({
                article_a: c.articleA,
                article_b: c.articleB,
                similarity: c.similarity.toFixed(1) + '%',
                examples: c.sharedShingles
            }))
        }, 200);
    } catch (error) {
        console.error('Error scanning duplicates:', error);
        return respond({ error: 'Failed to scan duplicates' }, 500);
    }
};
|
||||
|
||||
/**
 * Strip HTML tags and normalize text for comparison.
 *
 * Tags become spaces, the entities `&nbsp;`, `&lt;`, `&gt;` and `&amp;` are
 * decoded, whitespace runs collapse to one space, and the result is trimmed
 * and lower-cased so that comparison ignores markup and letter case.
 *
 * @param html HTML fragment.
 * @returns Normalized lower-case plain text.
 */
function stripHtml(html: string): string {
    return html
        .replace(/<[^>]*>/g, ' ')
        .replace(/&nbsp;/g, ' ')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        // Decoded last so "&amp;lt;" becomes "&lt;" and is not unescaped twice.
        .replace(/&amp;/g, '&')
        .replace(/\s+/g, ' ')
        .trim()
        .toLowerCase();
}
|
||||
|
||||
/**
 * Generate the set of distinct N-gram shingles of a text.
 *
 * The text is split on whitespace and words of one or two characters are
 * dropped; every run of `n` consecutive remaining words, joined by single
 * spaces, is one shingle.
 *
 * @param text Normalized plain text.
 * @param n    Words per shingle.
 * @returns The distinct shingles. Empty when `n` is not a positive integer or
 *          the text has fewer than `n` qualifying words.
 */
function generateShingles(text: string, n: number): Set<string> {
    const shingles = new Set<string>();

    // n <= 0 would emit empty-string shingles shared by every article, and a
    // fractional n would produce windows of inconsistent length.
    if (!Number.isInteger(n) || n < 1) return shingles;

    const words = text.split(/\s+/).filter(word => word.length > 2);
    for (let start = 0; start + n <= words.length; start++) {
        shingles.add(words.slice(start, start + n).join(' '));
    }

    return shingles;
}
|
||||
Reference in New Issue
Block a user