-
Notifications
You must be signed in to change notification settings - Fork 44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DO NOT MERGE #2736
base: main
Are you sure you want to change the base?
DO NOT MERGE #2736
Changes from 5 commits
fedba2b
b882a96
6236f1f
c4bcf99
5ef5ce2
66a0e99
7ec1624
6321502
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import { DynamoDBClient, paginateScan } from '@aws-sdk/client-dynamodb' | ||
import { GetParameterCommand, SSMClient } from '@aws-sdk/client-ssm' | ||
import { BatchWriteCommand } from '@aws-sdk/lib-dynamodb' | ||
import { unmarshall } from '@aws-sdk/util-dynamodb' | ||
import chunk from 'lodash/chunk.js' | ||
|
||
// THIS CODE IS CUT AND PASTED FROM THE APP CODE IN /routes/circulars/circulars.lib.ts | ||
// running as a node script it would not import the module from the file | ||
const subjectMatchers = [ | ||
[/GRB[.\s_-]*(\d{6}[a-z|.]\d*)/i, ([, id]) => `GRB ${id.toUpperCase()}`], | ||
[/SGR[.\s_-]*(J*\d{4}\.?\d*\+\d{4})/i, ([, id]) => `SGR ${id.toUpperCase()}`], | ||
[ | ||
/SGR[.\s_-]*Swift[.\s_-]*(J*\d{4}\.?\d*\+\d{4})/i, | ||
([, id]) => `SGR Swift ${id.toUpperCase()}`, | ||
], | ||
[/IceCube[.\s_-]*(\d{6}[a-z])/i, ([, id]) => `IceCube-${id.toUpperCase()}`], | ||
[/ZTF[.\s_-]*(\d{2}[a-z]*)/i, ([, id]) => `ZTF${id.toLowerCase()}`], | ||
[/HAWC[.\s_-]*(\d{6}A)/i, ([, id]) => `HAWC-${id.toUpperCase()}`], | ||
[ | ||
/((?:LIGO|Virgo|KAGRA)(?:[/-](?:LIGO|Virgo|KAGRA))*)[-_ \s]?(S|G|GW)(\d{5,}[a-z]*)/i, | ||
([, instrument, flag, id]) => { | ||
return `${instrument} ${flag.toUpperCase()}${id.toLowerCase()}` | ||
}, | ||
], | ||
[/ANTARES[.\s_-]*(\d{6}[a-z])/i, ([, id]) => `ANTARES ${id}`.toUpperCase()], | ||
[ | ||
/Baksan\sNeutrino\sObservatory\sAlert[.\s_-]*(\d{6}.\d{2})/i, | ||
([, id]) => `Baksan Neutrino Observatory Alert ${id}`, | ||
], | ||
[/EP[.\s_-]*(\d{6}[a-z])/i, ([, id]) => `EP${id}`], | ||
[/FRB[.\s_-]*(\d{8}[a-z])/i, ([, id]) => `FRB ${id}`.toUpperCase()], | ||
] | ||
|
||
export function parseEventFromSubject(value) { | ||
for (const [regexp, normalize] of subjectMatchers) { | ||
const startsWithMatch = RegExp(`^${regexp.source}`).exec(value) | ||
if (startsWithMatch) return normalize(startsWithMatch) | ||
} | ||
for (const [regexp, normalize] of subjectMatchers) { | ||
const match = regexp.exec(value) | ||
if (match) return normalize(match) | ||
} | ||
} | ||
// END CUT AND PASTE OF APP CODE FROM /routes/circulars/circulars.lib.ts | ||
|
||
async function getTableNameFromSSM(dynamoTableName) { | ||
const ssmClient = new SSMClient({ region: 'us-east-1' }) | ||
|
||
try { | ||
const command = new GetParameterCommand({ Name: dynamoTableName }) | ||
const response = await ssmClient.send(command) | ||
|
||
if (!response.Parameter?.Value) { | ||
throw new Error('dynamoTableName not found in SSM') | ||
} | ||
|
||
return response.Parameter.Value | ||
} catch (error) { | ||
console.error('Error fetching table name from SSM:', error) | ||
throw error | ||
} | ||
} | ||
|
||
export async function backfillEventIds() { | ||
const startTime = new Date() | ||
console.log('Starting EVENT ID backfill...', startTime) | ||
const writeLimit = 1000 | ||
const dynamoTableName = '/RemixGcnProduction/tables/circulars' | ||
const TableName = await getTableNameFromSSM(dynamoTableName) | ||
const client = new DynamoDBClient({ region: 'us-east-1' }) | ||
const pages = paginateScan({ client }, { TableName }) | ||
let totalWriteCount = 0 | ||
let limitHit = false | ||
|
||
for await (const page of pages) { | ||
if (limitHit) break | ||
const chunked = chunk(page.Items || [], 25) | ||
|
||
for (const chunk of chunked) { | ||
if (limitHit) break | ||
let writes = [] | ||
|
||
for (const record of chunk) { | ||
if (limitHit) break | ||
|
||
const circular = unmarshall(record) | ||
const parsedEventId = parseEventFromSubject(circular.subject) | ||
|
||
if (!circular.eventId && parsedEventId) { | ||
circular.eventId = parsedEventId | ||
writes.push(circular) | ||
} | ||
} | ||
|
||
if (writes.length + totalWriteCount > writeLimit) { | ||
const overage = writes.length + totalWriteCount - writeLimit | ||
const writesToLimitTo = writes.length - overage | ||
writes = writes.slice(0, writesToLimitTo) | ||
limitHit = true | ||
} | ||
|
||
if (writes.length > 0) { | ||
const command = new BatchWriteCommand({ | ||
RequestItems: { | ||
[TableName]: writes.map((circ) => ({ | ||
PutRequest: { | ||
Item: { | ||
...circ, | ||
}, | ||
}, | ||
})), | ||
}, | ||
}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is quite complicated. I would be more comfortable with the correctness of a functional programming implementation using TC39 Async Iterator Helpers (see polyfills available from core-js). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this necessary to make this one-time use script functional? Does our other code comply with this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are a lot of loops and conditionals here, which makes the code too complicated to review quickly. As for whether we have existing code like this, I don't know of any other examples where we have a paginated input, which is filtered, and then which must be paginated again for insertion. That is the source of the complexity. |
||
|
||
await client.send(command) | ||
totalWriteCount = writes.length + totalWriteCount | ||
} | ||
} | ||
} | ||
const endTime = new Date() | ||
console.log('... End EVENT ID backfill... ', endTime) | ||
console.log('Total Event Ids Updated: ', totalWriteCount) | ||
} | ||
|
||
backfillEventIds() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not comfortable copying and pasting this. If you are not able to import it from a standalone Node.js script, then please convert this to a
.ts
source and use esbuild to compile it down to JS. You should be able to hack something into https://github.com/nasa-gcn/gcn.nasa.gov/blob/main/esbuild.config.js to do that.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you explain to me why? It's a literal cut and paste (though I will have to adapt the GRB matcher to include ones without a letter). Copying and pasting directly from the current app code and updating that subject matcher shouldn't be an issue. I'm not sure what compiling etc would buy us over a copy and paste. I also have the issue that I need to include eventIds on circulars that have GRBs without a letter, but they are not permissible anymore, so it shouldn't be added to the app subject matchers. I'm not sure what hacking around in the esbuild would get us that would be any better
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This script is standalone and will only be run once on prod. It also requires changes compared to the regular subject matchers to account for changes in GRB naming over time. If there are no problems with the logic of how this is being implemented, let's just go with how this is currently implemented.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For the same reasons why we try to avoid duplicate copies of all but the simplest code in general: there is the risk that the two copies will not be the same, that bugs in one won't be fixed in the other, that they will become incompatible, etc. Even though this PR contains code that we do not plan to reuse, there is still the very real risk that the new copy in the PR will not be the same as the original copy in the main source code.
We have added new regular expressions several times while this PR has been open. In order to review this properly, one would need to manually check that the two copies are the same in this PR, and then we would need to re-check it any time one makes changes to either this PR or to the original code on the main branch.
I do not think that this is an acceptable risk, but @jracusin should have the final say.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's fine. I can just check the subject matchers before I run them. I have both in my current version. I also have to adapt the GRB matcher so it includes GRBs without a letter in them. Currently the subject matchers will only match on GRBs with a letter. So there is already going to be a difference than what would be imported.