Skip to content

Commit

Permalink
Merge pull request #156 from nih-sparc/update-crawl-indexing-settings
Browse files Browse the repository at this point in the history
Updated settings for crawlers
  • Loading branch information
egauzens authored Jul 10, 2024
2 parents da77be6 + 1d9f7f6 commit ac8a63a
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
6 changes: 3 additions & 3 deletions nuxt.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export default defineNuxtConfig({
{ hid: 'og:image:secure_url', property: 'og:image:secure_url',
content: 'https://images.ctfassets.net/6bya4tyw8399/7r5WTb92QnHkub8RsExuc1/2ac134de2ddfd65eb6316421df7578f9/sparc-logo-primary.png'
},
{ hid: 'robots', name: 'robots', content: 'max-snippet:-1, max-image-preview:large, max-video-preview:-1, crawl-delay:3600' },
{ hid: 'og:url', property: 'og:url', content: process.env.ROOT_URL || 'sparc.science' },
{ hid: 't-type', name: 'twitter:card', content: 'summary_large_image' },
{ name: 'twitter:site', content: '@sparc_science' },
Expand Down Expand Up @@ -210,20 +211,19 @@ export default defineNuxtConfig({
},
robots: {
sitemap: 'https://sparc.science/sitemap.xml',
allow: ['/datasets'],
// provide simple disallow rules for all robots `user-agent: *`
    // disallowing certain pages that are either redirects, authenticated routes, or cause bots to recursively crawl
disallow: [
'/welcome',
'/user',
'/data',
'/contact-us',
'/help',
'/signup',
'/maps',
'/news-and-events/submit',
'/news-and-events/community-spotlight/submit'
],
blockNonSeoBots: true
blockNonSeoBots: true,
crawlDelay: 3600
}
})
22 changes: 22 additions & 0 deletions server/middleware/webCrawlers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Server middleware that detects Googlebot and short-circuits requests to the
// site root, /data/*, and /datasets/* with an empty 200 response, preventing
// the bot from recursively crawling those heavy routes.
export default defineEventHandler((event) => {
  const req = event?.node?.req
  const res = event?.node?.res
  // BUG FIX: the request path lives on the Node request object (req.url);
  // `event.node.url` does not exist in h3, so the previous code always saw
  // `undefined` and applied the Googlebot block to every route.
  const route = req?.url
  const userAgent = req?.headers['user-agent']?.toLowerCase()
  // Top-level path segments for which Googlebot is served an empty page
  const googlebotPaths = ['data', 'datasets']

  // Normalize the route to its first path segment, e.g. '/datasets/123' -> 'datasets'
  let firstPartOfRoute = route ? route.replace(/^\//, '') : ''
  const slashIndex = firstPartOfRoute.indexOf('/')
  if (slashIndex !== -1) {
    firstPartOfRoute = firstPartOfRoute.substring(0, slashIndex)
  }

  // '' means the site root; otherwise only intercept the listed sections
  if (firstPartOfRoute === '' || googlebotPaths.includes(firstPartOfRoute)) {
    if (userAgent && (userAgent.includes('googlebot') || userAgent.includes('google.com/bot.html'))) {
      res.statusCode = 200
      res.setHeader('Content-Type', 'text/plain')
      res.end('Googlebot detected, serving empty response.')
    }
  }
})

0 comments on commit ac8a63a

Please sign in to comment.