Skip to content

Commit

Permalink
urls: fix wikipedia url detection with parenthesis
Browse files Browse the repository at this point in the history
Fixes: f0df4aa ("Strip common punctuations from URLs")
Fixes: #1027
Closes: #1063
Changelog-Fixed: Fix wikipedia url detection with parenthesis
  • Loading branch information
jb55 committed Aug 6, 2023
1 parent 85930df commit 53e9269
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 10 deletions.
6 changes: 1 addition & 5 deletions damus-c/cursor.h
Original file line number Diff line number Diff line change
Expand Up @@ -447,12 +447,8 @@ static inline int is_left_boundary(char c) {
return is_right_boundary(c) || is_utf8_byte(c);
}

/*
 * Report whether `c` is one of the punctuation characters that should not
 * be treated as the last character of a URL: '!', '?', ')', '.', ',' or ';'.
 *
 * Returns 1 for those six characters and 0 for everything else.
 */
static inline int is_invalid_url_ending(char c) {
	switch (c) {
	case '!':
	case '?':
	case ')':
	case '.':
	case ',':
	case ';':
		return 1;
	default:
		return 0;
	}
}

/*
 * Report whether `c` is an ASCII letter (upper or lower case) or an ASCII
 * decimal digit.
 *
 * The ranges are spelled out explicitly instead of using <ctype.h> so the
 * result does not depend on the current locale or on the signedness of
 * `char`.
 *
 * Returns non-zero for 'A'-'Z', 'a'-'z' and '0'-'9', zero otherwise.
 */
static inline int is_alphanumeric(char c) {
	return (c >= 'A' && c <= 'Z') ||
	       (c >= 'a' && c <= 'z') ||
	       (c >= '0' && c <= '9');
}

static inline int consume_until_boundary(struct cursor *cur) {
Expand Down
73 changes: 68 additions & 5 deletions damus-c/damus.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,69 @@ static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8
return add_block(blocks, b);
}

/*
 * Consume the optional trailing part of a URL that is introduced by '?'
 * or '#'.
 *
 * If peek_char() reports nothing at the cursor (negative result), or the
 * character there is neither '?' nor '#', the cursor is left untouched and
 * 1 is returned: the part is optional, so its absence is not a failure.
 *
 * Otherwise the introducer is skipped and the remainder is handed to
 * consume_until_whitespace(), whose result is returned as-is.
 * NOTE(review): the meaning of the second argument (1) to
 * consume_until_whitespace() is not visible here - confirm against its
 * definition.
 */
static int consume_url_fragment(struct cursor *cur)
{
	int next = peek_char(cur, 0);

	if (next >= 0 && (next == '#' || next == '?')) {
		/* step over the '?' / '#' introducer */
		cur->p++;
		return consume_until_whitespace(cur, 1);
	}

	return 1;
}

/*
 * Consume the optional path component of a URL.
 *
 * The path must start with '/'. If peek_char() reports nothing at the
 * cursor (negative result) or the character there is not '/', nothing is
 * consumed. Otherwise the cursor advances until it reaches '?', '#', a
 * whitespace character (per is_whitespace()), or the end of the buffer;
 * the stopping character itself is not consumed.
 *
 * Always returns 1: a missing path is not an error.
 */
static int consume_url_path(struct cursor *cur)
{
	int first = peek_char(cur, 0);

	if (first < 0 || first != '/')
		return 1;

	for (; cur->p < cur->end; cur->p++) {
		int ch = *cur->p;

		/* '?' / '#' start the query or fragment; whitespace ends the URL */
		if (ch == '?' || ch == '#' || is_whitespace(ch))
			break;
	}

	return 1;
}

/*
 * Consume the hostname part of a URL.
 *
 * Advances the cursor over a run of characters accepted by
 * is_alphanumeric(), plus '.' and '-', stopping at the first other
 * character or at the end of the buffer.
 *
 * Returns 1 if at least one character was consumed, 0 otherwise.
 */
static int consume_url_host(struct cursor *cur)
{
	int consumed = 0;

	// TODO: handle IDNs
	while (cur->p < cur->end) {
		char ch = *cur->p;

		if (!is_alphanumeric(ch) && ch != '.' && ch != '-')
			break;

		cur->p++;
		consumed++;
	}

	// reaching the end of the buffer simply means the hostname ran to
	// the end of the input; that is still a valid host if non-empty
	return consumed != 0;
}

static int parse_url(struct cursor *cur, struct note_block *block) {
u8 *start = cur->p;

Expand All @@ -121,15 +184,15 @@ static int parse_url(struct cursor *cur, struct note_block *block) {
return 0;
}
}

if (!consume_until_whitespace(cur, 1)) {

if (!(consume_url_host(cur) &&
consume_url_path(cur) &&
consume_url_fragment(cur)))
{
cur->p = start;
return 0;
}

// strip any unwanted characters
while(is_invalid_url_ending(peek_char(cur, -1))) cur->p--;

block->type = BLOCK_URL;
block->block.str.start = (const char *)start;
block->block.str.end = (const char *)cur->p;
Expand Down

0 comments on commit 53e9269

Please sign in to comment.