diff --git a/adsmp/app.py b/adsmp/app.py
index e80168b..8a41c63 100644
--- a/adsmp/app.py
+++ b/adsmp/app.py
@@ -2,12 +2,12 @@
from __future__ import absolute_import, unicode_literals
from past.builtins import basestring
from . import exceptions
-from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records
+from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records, SitemapInfo
from adsmsg import OrcidClaims, DenormalizedRecord, FulltextUpdate, MetricsRecord, NonBibRecord, NonBibRecordList, MetricsRecordList, AugmentAffiliationResponseRecord, AugmentAffiliationRequestRecord
from adsmsg.msg import Msg
from adsputils import ADSCelery, create_engine, sessionmaker, scoped_session, contextmanager
from sqlalchemy.orm import load_only as _load_only
-from sqlalchemy import Table, bindparam
+from sqlalchemy import Table, bindparam, func
import adsputils
import json
from adsmp import solr_updater
@@ -19,7 +19,10 @@
from copy import deepcopy
import sys
from sqlalchemy.dialects.postgresql import insert
-
import gzip
import os
import pdb  # TODO(review): debugger import left in -- remove before release
from itertools import chain

from sqlalchemy import exc  # needed by populate_sitemap_table's IntegrityError handler
class ADSMasterPipelineCelery(ADSCelery):
@@ -92,6 +95,7 @@ def update_storage(self, bibcode, type, payload):
oldval = r.bib_data
r.bib_data = payload
r.bib_data_updated = now
+ self.populate_sitemap_table(r.bibcode, action='add')
elif type == 'nonbib_data':
oldval = r.nonbib_data
r.nonbib_data = payload
@@ -136,6 +140,11 @@ def delete_by_bibcode(self, bibcode):
session.delete(r)
session.commit()
return True
+ s = session.query(SitemapInfo).filter_by(bibcode=bibcode).first()
+ if s is not None:
+ session.delete(s)
+ session.commit()
+ return True
def rename_bibcode(self, old_bibcode, new_bibcode):
assert old_bibcode and new_bibcode
@@ -555,3 +564,209 @@ def generate_links_for_resolver(self, record):
# here if record holds unexpected value
self.logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(bibcode, type(bib_links_record), bib_links_record))
return resolver_record
+
+ def populate_sitemap_table(self, sitemap_record, sitemap_dir, action = 'add'):
+ """Populate the sitemap with the given record id and action.
+
+ :param bibcode: bibcode of record to be updated
+ :param action: string, either 'add' or 'remove'
+ """
+
+ if action not in ['add', 'remove', 'force-update', 'delete-table']:
+ self.logger.error("Invalid action %s, must be 'add', 'remove', 'force-update', 'delete-table'", action)
+ return
+
+ with self.session_scope() as session:
+ info = session.query(SitemapInfo).filter_by(record_id=sitemap_record['record_id']).first()
+
+ if info is None:
+ info = SitemapInfo(record_id=sitemap_record["record_id"])
+ session.add(info)
+
+ info.record_id = sitemap_record.get('record_id', None)
+ info.bibcode = sitemap_record.get('bibcode', None)
+ info.filename_lastmoddate = sitemap_record.get('filename_lastmoddate', None)
+ info.sitemap_filename = sitemap_record.get('sitemap_filename', None)
+ info.bib_data_updated = sitemap_record.get('bib_data_updated', None)
+
+ # This block is executed if a new record is being added to the sitemap table OR if the sitemap table is being forcefully updated
+ # Generate a new sitemap filename for the record
+ if action == 'force-update':
+ info.update_flag = True
+
+ if (info.sitemap_filename is None):
+ # get list of sitemap filenames from the SiteMapInfo table
+ sitemap_files = session.query(SitemapInfo.sitemap_filename).all()
+ sitemap_files = list(chain.from_iterable(sitemap_files))
+
+ if sitemap_files and any(sitemap_files):
+ # remove and None values from the list
+ sitemap_files = list(filter(lambda x: x is not None, sitemap_files))
+
+ # split all the filenames to get the index of the latest sitemap file
+ sitemap_file_indices = [int(sitemap_file.split('_bib_')[-1].split('.')[0]) for sitemap_file in sitemap_files]
+ latest_index = max(sitemap_file_indices)
+ sitemap_file_latest = 'sitemap_bib_{}.xml'.format(latest_index)
+
+ # Check the number of files assigned to sitemap_file_latest
+ sitemap_file_latest_count = session.query(SitemapInfo).filter_by(sitemap_filename=os.path.join(sitemap_dir,sitemap_file_latest)).count()
+ print(sitemap_file_latest)
+ print(sitemap_file_latest_count)
+
+ if sitemap_file_latest_count >= 3:
+ latest_index += 1
+ sitemap_filename = os.path.join(sitemap_dir, 'sitemap_bib_{}.xml'.format(latest_index))
+ else:
+ sitemap_filename = os.path.join(sitemap_dir, sitemap_file_latest)
+
+ info.sitemap_filename = sitemap_filename
+ info.update_flag = True
+ else:
+ info.sitemap_filename = os.path.join(sitemap_dir, 'sitemap_bib_1.xml')
+ info.update_flag = True
+ try:
+ session.commit()
+ return True
+ except exc.IntegrityError:
+ self.logger.exception('error')
+ session.rollback()
+ raise
+
+ #TODO: how to handle empty files in case of mass deletion?
+
+ def update_sitemap_files(self):
+ """Update the sitemap file with the given record id and action.
+
+ :param bibcode: bibcode of record to be updated
+ """
+ with self.session_scope() as session:
+ # Filter all records with update_flag = True and Count the number of records for each sitemap_filename
+ # SELECT sitemap_filename, count(*)
+ # FROM sitemap
+ # WHERE update_flag = True
+ # GROUP BY sitemap_filename
+ sitemapinfo = session.query(SitemapInfo.sitemap_filename, func.count('*').label('count')).filter(SitemapInfo.update_flag == True).group_by(SitemapInfo.sitemap_filename).all()
+ #unique sitemap filenames
+ sitemap_filenames = list(map(lambda item: item[0], sitemapinfo))
+
+ for file in sitemap_filenames:
+ fileinfo = session.query(SitemapInfo).filter_by(sitemap_filename=file)
+ newlines = []
+ for info in fileinfo:
+ # Update the sitemap file
+ info.filename_lastmoddate = adsputils.get_date()
+ newline = '\nhttps://ui.adsabs.harvard.edu/abs/{}/abstract{}'.format(info.bibcode, info.bib_data_updated.date())
+ newlines.append(newline)
+ info.update_flag = False
+ session.commit()
+
+ with open(file, 'w') as file:
+ file.write('')
+ file.write('\n')
+ for newline in newlines:
+ file.write(newline)
+
+ return
+
+ def get_sitemap_info(self, bibcode):
+ """Get sitemap info for given bibcode.
+
+ :param bibcode: bibcode of record to be updated
+ """
+ with self.session_scope() as session:
+ info = session.query(SitemapInfo).filter_by(bibcode=bibcode).first()
+ if info is None:
+ return None
+ return info.toJSON()
+
+ def delete_contents(self, table):
+ """Delete all contents of the table
+
+ :param table: string, name of the table
+ """
+ with self.session_scope() as session:
+ session.query(table).delete()
+ session.commit()
+
+ def backup_sitemap_files(self, directory):
+ """Backup the sitemap files to the given directory
+
+ :param dir: string, directory to backup the sitemap files
+ """
+
+ # move dir to /app/tmp
+ date = adsputils.get_date()
+ tmpdir = '/app/logs/tmp/sitemap_{}_{}_{}-{}/'.format(date.year, date.month, date.day, date.time())
+ os.system('mkdir -p {}'.format(tmpdir))
+ os.system('mv {}/* {}/'.format(directory, tmpdir))
+ return
+
+ def create_robot_txt_file(self, sitemap_dir, sitemap_url='https://ui.adsabs.harvard.edu/sitemap/'):
+ """Create a robots.txt file with the given sitemap url
+
+ :param sitemap_url: string, url of the sitemap
+ """
+ with open(sitemap_dir+'/robots.txt', 'w') as file:
+ file.write("Sitemap: "+sitemap_url+"/sitemap_index.xml")
+
+ file.write("\nUser-agent: *")
+ file.write("\nDisallow: /v1/")
+ file.write("\nDisallow: /resources")
+ file.write("\nDisallow: /core")
+ file.write("\nDisallow: /tugboat")
+ file.write("\nDisallow: /link_gateway/")
+ file.write("\nDisallow: /search/")
+ file.write("\nDisallow: /execute-query/")
+ file.write("\nDisallow: /status")
+
+ file.write("\n\nAllow: /help/")
+ file.write("\nAllow: /about/")
+ file.write("\nAllow: /blog/")
+ file.write("\nAllow: /abs/")
+
+ file.write("\n\nDisallow: /abs/*/citations")
+ file.write("\nDisallow: /abs/*/references")
+ file.write("\nDisallow: /abs/*/coreads")
+ file.write("\nDisallow: /abs/*/similar")
+ file.write("\nDisallow: /abs/*/toc")
+ file.write("\nDisallow: /abs/*/graphics")
+ file.write("\nDisallow: /abs/*/metrics")
+ file.write("\nDisallow: /abs/*/exportcitation")
+ return
+
    def create_sitemap_index(self, sitemap_dir, sitemap_url='https://ui.adsabs.harvard.edu/sitemap/'):
        """Create a sitemap index file with the given sitemap url

        Writes sitemap_index.xml into *sitemap_dir*, listing every distinct
        sitemap filename found in the SitemapInfo table (ordered by the
        smallest row id in each file) together with its last-modified date.

        :param sitemap_dir: directory where sitemap_index.xml is written
        :param sitemap_url: string, url of the sitemap
        """
        # NOTE(review): the empty-string writes below almost certainly used to
        # contain the XML prologue and <sitemapindex> open/close tags and look
        # like they were stripped; as written the index has no XML structure.
        # Restore the intended literals before relying on this output.
        with open(sitemap_dir+'/sitemap_index.xml', 'w') as file:
            file.write('')
            file.write('')
            file.write('')
            file.write('')
            file.write('')
            file.write('')
            file.write('\n')

            # NOTE(review): the source's indentation was ambiguous about
            # whether this session block is nested inside the open() block;
            # it must be nested (as here), otherwise every file.write below
            # would hit a closed file handle -- confirm against the original.
            with self.session_scope() as session:
                # one row per distinct sitemap filename, ordered by the
                # smallest record id appearing in that file
                sitemapinfo = session.query(SitemapInfo.sitemap_filename, func.count('*').label('count')).group_by(
                    SitemapInfo.sitemap_filename).order_by(
                        func.min(SitemapInfo.id).asc()).all()

                #unique sitemap filenames
                sitemap_filenames = list(map(lambda item: item[0], sitemapinfo))

                for filename in sitemap_filenames:
                    # last-modified date of any row in this file (first() is
                    # arbitrary among the file's rows)
                    lastmoddate = session.query(SitemapInfo.filename_lastmoddate).filter_by(sitemap_filename=filename).first()
                    # strip the directory: the index references the bare name
                    filename = filename.split('/')[-1]

                    try:
                        file.write('\n')
                        file.write('\n'+sitemap_url+filename+'')
                        file.write('\n'+str(lastmoddate[0].date())+'')
                        file.write('\n')
                    # TODO(review): bare except hides the real error (e.g.
                    # lastmoddate[0] being None); catch the specific exception
                    except:
                        self.logger.error('Error writing '+filename+' to sitemap_index.xml')
                        continue
                file.write('\n')
        return
diff --git a/adsmp/models.py b/adsmp/models.py
index b442175..12c5cc0 100644
--- a/adsmp/models.py
+++ b/adsmp/models.py
@@ -12,6 +12,7 @@
from sqlalchemy.dialects import postgresql
from sqlalchemy import text
import json
+from sqlalchemy import ForeignKey
Base = declarative_base()
MetricsBase = declarative_base()
@@ -143,6 +144,43 @@ class IdentifierMapping(Base):
def toJSON(self):
return {'key': self.key, 'target': self.target}
class SitemapInfo(Base):
    """
    Maps each bibcode to a sitemap file and tracks the last modification date of the file.

    Attributes:
        id (int): The unique identifier for the sitemap entry.
        record_id (int): The foreign key referencing the associated row in 'records'.
        bibcode (str): Bibcode of the associated record.
        bib_data_updated (datetime): When the record's bib data was last updated.
        sitemap_filename (str): The filename of the sitemap file holding this record.
        filename_lastmoddate (datetime): The last modification date of that file.
        update_flag (bool): A flag indicating whether the sitemap needs to be updated.
    """

    __tablename__ = 'sitemap'

    id = Column(Integer, primary_key=True)  # bigint
    record_id = Column(Integer, ForeignKey('records.id'))  # TODO: Should foreign key be used for a primary key?
    bibcode = Column(String(255))
    bib_data_updated = Column(UTCDateTime, default=None)
    sitemap_filename = Column(String(255))
    filename_lastmoddate = Column(UTCDateTime, default=get_date)
    update_flag = Column(Boolean, default=False)

    def toJSON(self):
        """
        Converts the SitemapInfo object to a JSON-serializable dict.

        Returns:
            dict: all column values keyed by column name.
        """
        return {
            'id': self.id,
            'record_id': self.record_id,
            'bibcode': self.bibcode,
            # previously omitted, although callers read/write this key on the
            # dict returned by get_sitemap_info()
            'bib_data_updated': self.bib_data_updated,
            'sitemap_filename': self.sitemap_filename,
            'filename_lastmoddate': self.filename_lastmoddate,
            'update_flag': self.update_flag,
        }
## This definition is copied directly from: https://github.com/adsabs/metrics_service/blob/master/service/models.py
## We need to have it when we are sending/writing data into the metrics database
diff --git a/adsmp/tasks.py b/adsmp/tasks.py
index 29216fc..20c0bc2 100644
--- a/adsmp/tasks.py
+++ b/adsmp/tasks.py
@@ -7,7 +7,10 @@
from adsmp import solr_updater
from kombu import Queue
from adsmsg.msg import Msg
-
+from sqlalchemy import create_engine, MetaData, Table
+from sqlalchemy.orm import sessionmaker
+from adsmp.models import SitemapInfo
+import pdb
# ============================= INITIALIZATION ==================================== #
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
@@ -22,6 +25,7 @@
Queue('index-solr', app.exchange, routing_key='index-solr'),
Queue('index-metrics', app.exchange, routing_key='index-metrics'),
Queue('index-data-links-resolver', app.exchange, routing_key='index-data-links-resolver'),
+ Queue('generate-sitemap', app.exchange, routing_key='generate-sitemap'),
)
@@ -304,6 +308,82 @@ def task_delete_documents(bibcode):
else:
logger.debug('Failed to deleted metrics record: %s', bibcode)
@app.task(queue='populate-sitemap-table')
def task_populate_sitemap_table(bibcodes, action='add', sitemap_dir='/app/logs/sitemap/'):
    """
    Populate the sitemap table for the given bibcodes.

    :param bibcodes: a single bibcode or a list of bibcodes
    :param action: 'add' (default), 'remove', 'force-update', or 'delete-table'
    :param sitemap_dir: directory holding the generated sitemap files
    """
    # NOTE(review): queue 'populate-sitemap-table' is not declared in the
    # CELERY_QUEUES tuple above (only 'generate-sitemap' is); verify the queue
    # name or declare it, otherwise no worker will consume this task
    if action not in ['add', 'remove', 'force-update', 'delete-table']:
        logger.error("Invalid action %s, must be 'add', 'remove', 'force-update', 'delete-table'", action)
        return

    if action == 'delete-table':
        # reset and empty all entries in sitemap table
        app.delete_contents(SitemapInfo)
        # move all sitemap files to a backup directory
        app.backup_sitemap_files(sitemap_dir)
        return

    # TODO: update records table schema to accommodate foreign key (in alembic)
    # NOTE(review): `basestring` (past.builtins) is not visible in this
    # module's import block -- verify it is imported, otherwise NameError
    if isinstance(bibcodes, basestring):
        bibcodes = [bibcodes]

    logger.debug('Updating sitemap info for: %s', bibcodes)
    fields = ['id', 'bibcode', 'bib_data_updated']

    for bibcode in bibcodes:
        r = app.get_record(bibcode, load_only=fields)
        if r is None:
            logger.error('The bibcode %s doesn\'t exist!', bibcode)
            continue

        bib_data_updated = r.get('bib_data_updated', None)
        s = app.get_sitemap_info(bibcode)

        if s is None:
            # no sitemap row yet: build one and force a filename assignment
            sitemap_record = {
                'record_id': r.get('id', None),
                'bibcode': r.get('bibcode', None),
                'bib_data_updated': bib_data_updated,
                'filename_lastmoddate': None,
                'sitemap_filename': None,
            }
            app.populate_sitemap_table(sitemap_record, sitemap_dir, action='force-update')
        else:
            # existing row: flag for regeneration when the record's bib data
            # is newer than the sitemap file it currently lives in
            filename_lastmoddate = s.get('filename_lastmoddate', None)
            s['bib_data_updated'] = bib_data_updated

            if bib_data_updated and filename_lastmoddate and bib_data_updated > filename_lastmoddate:
                s['update_flag'] = True

            if action == 'force-update':
                s['update_flag'] = True

            app.populate_sitemap_table(s, sitemap_dir, action=action)
+
+
@app.task(queue='update-sitemap-files')
def task_update_sitemap_files(sitemap_dir='/app/logs/sitemap/'):
    """Regenerate sitemap files for flagged records and refresh support files.

    Rewrites every sitemap file that has records with update_flag set, then
    rewrites robots.txt and sitemap_index.xml in *sitemap_dir*.

    :param sitemap_dir: directory where the sitemap files are written
    """
    # NOTE(review): queue 'update-sitemap-files' is not declared in the
    # CELERY_QUEUES tuple above -- verify before deploying

    # Update sitemap files for records which have update_flag set to True
    app.update_sitemap_files()
    app.create_robot_txt_file(sitemap_dir)
    app.create_sitemap_index(sitemap_dir)

    # directory names : about, help, blog
    # need to query github to find when above dirs are updated: https://docs.github.com/en/rest?apiVersion=2022-11-28
    # need to generate an API token from ADStailor (maybe)
    #OTHER -- do this for ADS and SciX
if __name__ == '__main__':
app.start()
diff --git a/alembic/env.py b/alembic/env.py
index 3b25928..67cafe1 100644
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -16,9 +16,9 @@
# add your model's MetaData object here
# for 'autogenerate' support
-# from myapp import mymodel
-# target_metadata = mymodel.Base.metadata
-target_metadata = None
+from adsmp import models
+target_metadata = models.Base.metadata
+# target_metadata = None
# other values from the config, defined by the needs of env.py,
# can be acquired:
diff --git a/alembic/versions/ae542e1ad8eb_added_sitemap_table.py b/alembic/versions/ae542e1ad8eb_added_sitemap_table.py
new file mode 100644
index 0000000..2e88cdc
--- /dev/null
+++ b/alembic/versions/ae542e1ad8eb_added_sitemap_table.py
@@ -0,0 +1,42 @@
+"""Added sitemap table
+
+Revision ID: ae542e1ad8eb
+Revises: 2d2af8a9c996
+Create Date: 2024-05-22 19:44:23.053421
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = 'ae542e1ad8eb'
+down_revision = '2d2af8a9c996'
+
+from alembic import op
+import sqlalchemy as sa
+import adsmp
+
+
def upgrade():
    """Create the 'sitemap' table (one row per record placed in a sitemap file)."""
    # leftover alembic template boilerplate removed
    op.create_table('sitemap',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('record_id', sa.Integer(), nullable=True),
        sa.Column('bibcode', sa.String(length=255), nullable=True),
        sa.Column('bib_data_updated', adsmp.models.UTCDateTime(), nullable=True),
        sa.Column('sitemap_filename', sa.String(length=255), nullable=True),
        sa.Column('filename_lastmoddate', adsmp.models.UTCDateTime(), nullable=True),
        sa.Column('update_flag', sa.Boolean(), nullable=True),
        sa.ForeignKeyConstraint(['record_id'], ['records.id'], ),
        sa.PrimaryKeyConstraint('id')
    )
+
+
def downgrade():
    """Drop the 'sitemap' table."""
    op.drop_table('sitemap')
diff --git a/config.py b/config.py
index 0ea4294..ce2b863 100644
--- a/config.py
+++ b/config.py
@@ -2,7 +2,7 @@
# serves as a running log of claims and storage of author-related
# information). It is not consumed by others (ie. we 'push' results)
# SQLALCHEMY_URL = 'postgres://docker:docker@localhost:6432/docker'
-SQLALCHEMY_URL = "sqlite:///"
+SQLALCHEMY_URL = "postgresql://master_pipeline:master_pipeline@ingest_pipeline_test_db:5432/master_pipeline"
SQLALCHEMY_ECHO = False
diff --git a/run.py b/run.py
index 042ad14..e70e7db 100755
--- a/run.py
+++ b/run.py
@@ -16,7 +16,7 @@
from urlparse import urlparse
from adsputils import setup_logging, get_date, load_config
-from adsmp.models import KeyValue, Records
+from adsmp.models import KeyValue, Records, SitemapInfo
from adsmp import tasks, solr_updater, validate
from sqlalchemy.orm import load_only
from sqlalchemy.orm.attributes import InstrumentedAttribute
@@ -399,6 +399,20 @@ def reindex_failed_bibcodes(app, update_processed=True):
bibs = []
logger.info('Done reindexing %s previously failed bibcodes', count)
def populate_sitemap_table(bibcodes, action='add'):
    """Update the sitemap table for the given bibcodes.

    actions: 'add': add/update record info to sitemap table if
                 bib_data_updated is newer than filename_lastmoddate
             'remove': accepted by the task's validation (and by --action),
                 but NOTE(review): no remove-specific handling is visible
                 downstream -- confirm its intended behavior
             'delete-table': delete all contents of sitemap table
             'force-update': force update sitemap table entries for given bibcodes
    """
    tasks.task_populate_sitemap_table(bibcodes, action=action)
+
def update_sitemap_files():
    """
    Update all sitemap files for records with update_flag = True.

    Delegates to tasks.task_update_sitemap_files, which also regenerates
    robots.txt and sitemap_index.xml in the sitemap directory.
    """
    tasks.task_update_sitemap_files()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Process user input.')
@@ -524,6 +538,20 @@ def reindex_failed_bibcodes(app, update_processed=True):
default=False,
dest='update_processed',
help='update processed timestamps and other state info in records table when a record is indexed')
+ parser.add_argument('--populate-sitemap-table',
+ action='store_true',
+ default=False,
+ dest='populate_sitemap_table',
+ help='populate sitemap table for list of bibcode')
+ parser.add_argument('--action',
+ default=False,
+ choices=['add', 'delete-table', 'force-update', 'remove'],
+ help='action for populate sitemap table function')
+ parser.add_argument('--update-sitemap-files',
+ action='store_true',
+ default=False,
+ dest='update_sitemap_files',
+ help='update sitemap files for records with update_flag = True in sitemap table')
args = parser.parse_args()
@@ -594,6 +622,25 @@ def reindex_failed_bibcodes(app, update_processed=True):
rebuild_collection(args.solr_collection, args.batch_size)
elif args.index_failed:
reindex_failed_bibcodes(app, args.update_processed)
+ elif args.populate_sitemap_table:
+ if args.filename:
+ bibs = []
+ with open(args.filename) as f:
+ for line in f:
+ bibcode = line.strip()
+ if bibcode:
+ bibs.append(bibcode)
+ elif args.bibcodes:
+ bibs = args.bibcodes
+ if args.action:
+ action = args.action
+ else:
+ action = 'add'
+
+ #TODO: make async?
+ populate_sitemap_table(bibs, action = action)
+ elif args.update_sitemap_files:
+ update_sitemap_files()
elif args.reindex:
update_solr = 's' in args.reindex.lower()
update_metrics = 'm' in args.reindex.lower()
@@ -647,6 +694,3 @@ def reindex_failed_bibcodes(app, update_processed=True):
update_solr=update_solr, update_metrics=update_metrics,
update_links=update_links, force_processing=args.force_processing, ignore_checksums=args.ignore_checksums,
solr_targets=solr_urls, update_processed=args.update_processed, priority=args.priority)
-
-
-