Skip to content

Commit

Permalink
Restore functional URL rewriting
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Nov 18, 2024
1 parent bf4b53d commit 9c01981
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 68 deletions.
17 changes: 8 additions & 9 deletions packages/playground/data-liberation/plugin.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,14 @@
if('markdown' === $mode) {
$docs_root = __DIR__ . '/../../docs/site';
$docs_content_root = $docs_root . '/docs';
$markdown_entities_factory = function() use ($docs_content_root) {
$reader = new WP_Markdown_Directory_Tree_Reader(
$entity_iterator_factory = function() use ($docs_content_root) {
return new WP_Markdown_Directory_Tree_Reader(
$docs_content_root,
1000
);
return $reader->generator();
};
$markdown_importer = WP_Markdown_Importer::create(
$markdown_entities_factory, [
$entity_iterator_factory, [
'source_site_url' => 'file://' . $docs_content_root,
'local_markdown_assets_root' => $docs_root,
'local_markdown_assets_url_prefix' => '@site/',
Expand All @@ -68,13 +67,13 @@
$markdown_importer->frontload_assets();
$markdown_importer->import_posts();
} else {
$wxr_entities_factory = function() use ($wxr_path) {
return WP_WXR_Reader::stream_from(
new WP_File_Reader($wxr_path)
);
$entity_iterator_factory = function() use ($wxr_path) {
$wxr = new WP_WXR_Reader();
$wxr->connect_upstream(new WP_File_Reader($wxr_path));
return $wxr;
};
$wxr_importer = WP_Stream_Importer::create(
$wxr_entities_factory
$entity_iterator_factory
);
$wxr_importer->frontload_assets();
$wxr_importer->import_posts();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,50 +208,30 @@ public function set_raw_url( $new_url ) {
* relative URLs in text nodes. On the other hand, the detection is performed
* by this WP_URL_In_Text_Processor class so maybe the two do go hand in hand?
*/
function rewrite_url_components( URL $to_url ) {
$from_url = url_matches( $this->get_parsed_url(), $this->base_url_string ) ? $this->base_url_object : $this->get_parsed_url();

function replace_base_url( URL $to_url ) {
$updated_url = clone $this->get_parsed_url();

$updated_url->hostname = $to_url->hostname;
$updated_url->protocol = $to_url->protocol;
$updated_url->port = $to_url->port;

// Update the pathname if needed.
$from_url = $this->get_parsed_url();
$from_pathname = $from_url->pathname;
$to_pathname = $to_url->pathname;
if ( $from_pathname !== $to_pathname ) {
if ( $from_pathname[ strlen( $from_pathname ) - 1 ] === '/' ) {
$from_pathname = substr( $from_pathname, 0, strlen( $from_pathname ) - 1 );
}
$from_pathname_with_trailing_slash = $from_pathname === '/' ? $from_pathname : $from_pathname . '/';

$decoded_matched_pathname = urldecode_n(
$updated_url->pathname,
strlen( $from_pathname_with_trailing_slash )
if ( $this->base_url_object->pathname !== $to_pathname ) {
$base_pathname_with_trailing_slash = rtrim( $this->base_url_object->pathname, '/' ) . '/';
$decoded_matched_pathname = urldecode_n(
$from_pathname,
strlen( $base_pathname_with_trailing_slash )
);
/**
* If there's nothing to carry over from the original pathname,
* use the rewritten pathname as is.
*
* @TODO: Document this behavior in a human-readable way.
*/
if ( strlen( $decoded_matched_pathname ) >= strlen( $from_pathname_with_trailing_slash ) ) {
$updated_url->pathname = $to_pathname;
} else {
// Otherwise, add a trailing slash to the target pathname part and
// carry over the rest from the original pathname.
if ( $to_pathname[ strlen( $to_pathname ) - 1 ] === '/' ) {
$to_pathname = substr( $to_pathname, 0, strlen( $to_pathname ) - 1 );
}
$to_pathname_with_trailing_slash = $to_pathname === '/' ? $to_pathname : $to_pathname . '/';
$updated_url->pathname =
$to_pathname_with_trailing_slash .
substr(
$decoded_matched_pathname,
strlen( $from_pathname_with_trailing_slash )
);
}
$to_pathname_with_trailing_slash = rtrim( $to_pathname, '/' ) . '/';
$updated_url->pathname =
$to_pathname_with_trailing_slash .
substr(
$decoded_matched_pathname,
strlen( $base_pathname_with_trailing_slash )
);
}

/*
Expand All @@ -262,10 +242,10 @@ function rewrite_url_components( URL $to_url ) {
*/
$new_raw_url = $updated_url->toString();
if (
$updated_url->pathname[ strlen( $updated_url->pathname ) - 1 ] !== '/' &&
$updated_url->pathname === '/' &&
$updated_url->search === '' &&
$updated_url->hash === ''
$from_url->pathname[ strlen( $from_url->pathname ) - 1 ] !== '/' &&
$from_url->pathname !== '/' &&
$from_url->search === '' &&
$from_url->hash === ''
) {
$new_raw_url = rtrim( $new_raw_url, '/' );
}
Expand All @@ -282,8 +262,8 @@ function rewrite_url_components( URL $to_url ) {
// place to place this logic. Perhaps this *method* could be
// decoupled into two separate *functions*?
$this->get_token_type() !== '#text' &&
! str_starts_with( $new_raw_url, 'http://' ) &&
! str_starts_with( $new_raw_url, 'https://' )
! str_starts_with( $this->get_raw_url(), 'http://' ) &&
! str_starts_with( $this->get_raw_url(), 'https://' )
);
if ( ! $is_relative ) {
$this->set_raw_url( $new_raw_url );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class WP_Remote_File_Reader implements WP_Byte_Reader {
private $request;
private $current_chunk;
private $last_error;
private $is_finished;
private $is_finished = false;
private $bytes_already_read;
private $skip_bytes = 0;

Expand Down
15 changes: 10 additions & 5 deletions packages/playground/data-liberation/src/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ function wp_rewrite_urls( $options ) {
$parsed_url = $p->get_parsed_url();
foreach ( $url_mapping as $mapping ) {
if ( url_matches( $parsed_url, $mapping['from_url'] ) ) {
$p->rewrite_url_components( $mapping['from_url'], $mapping['to_url'] );
$p->replace_base_url( $mapping['to_url'] );
break;
}
}
Expand Down Expand Up @@ -83,10 +83,10 @@ function url_matches( URL $subject, $from_url ) {
* '"is 6 %3C 6?%22 – asked Achilles' because only the first encoded byte is decoded.
*
* @param string $input The string to decode.
* @param int $target_length The maximum length of the resulting string.
* @param int $decode_n Decoding stops once the result is at least this many bytes long.
* @return string The decoded string.
*/
function urldecode_n( $input, $target_length ) {
function urldecode_n( $input, $decode_n ) {
$result = '';
$at = 0;
while ( true ) {
Expand All @@ -100,23 +100,28 @@ function urldecode_n( $input, $target_length ) {
$result .= substr( $input, $last_at, $at - $last_at );

// If we've already decoded the requested number of bytes, stop.
if ( strlen( $result ) >= $target_length ) {
if ( strlen( $result ) >= $decode_n ) {
break;
}

++$at;
if ( $at > strlen( $input ) ) {
break;
}

$decodable_length = strspn(
$input,
'0123456789ABCDEFabcdef',
$at,
2
);

if ( $decodable_length === 2 ) {
// Decode the hex sequence.
$result .= chr( hexdec( $input[ $at ] . $input[ $at + 1 ] ) );
$at += 2;
} else {
// Consume the percent sign and move on.
// Consume the next byte and move on.
$result .= '%';
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ public function import_posts() {
if ( $this->url_processor_matched_asset_url( $p ) ) {
$filename = $this->new_asset_filename( $p->get_raw_url() );
$new_asset_url = $this->options['uploads_url'] . '/' . $filename;
$p->rewrite_url_components( WP_URL::parse( $new_asset_url ) );
$p->replace_base_url( WP_URL::parse( $new_asset_url ) );
$attachments[] = $new_asset_url;
/**
* @TODO: How would we know a specific image block refers to a specific
Expand All @@ -170,7 +170,7 @@ public function import_posts() {
$p->get_parsed_url() &&
url_matches( $p->get_parsed_url(), $this->source_site_url )
) {
$p->rewrite_url_components( WP_URL::parse( $this->options['new_site_url'] ) );
$p->replace_base_url( WP_URL::parse( $this->options['new_site_url'] ) );
} else {
// Ignore other URLs.
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,21 @@
* data importing pipeline that's not specific to a single input format.
*/

class WP_Markdown_Directory_Tree_Reader {
class WP_Markdown_Directory_Tree_Reader implements Iterator {
private $file_visitor;
private $entity;

private $pending_directory_index;
private $pending_files = array();
private $parent_ids = array();
private $next_post_id;
private $is_finished = false;

/**
 * Creates a reader rooted at the given directory.
 *
 * @param string $root_dir      Directory to read from. It is resolved with
 *                              realpath() before being handed to WP_File_Visitor.
 * @param int    $first_post_id Initial value of the reader's next-post-ID counter.
 */
public function __construct( $root_dir, $first_post_id ) {
	$resolved_root_dir = realpath( $root_dir );

	$this->next_post_id = $first_post_id;
	$this->file_visitor = new WP_File_Visitor( $resolved_root_dir );
}

public function generator() {
while ( true ) {
if ( false === $this->next_entity() ) {
break;
}
yield $this->entity;
}
}

public function next_entity() {
while ( true ) {
if ( null !== $this->pending_directory_index ) {
Expand Down Expand Up @@ -81,6 +73,7 @@ public function next_entity() {
break;
}
}
$this->is_finished = true;
return false;
}

Expand Down Expand Up @@ -261,4 +254,24 @@ private function remove_first_h1_block_from_block_markup( $html ) {
),
);
}

/**
 * Number of times next() has successfully advanced to a new entity.
 *
 * Reported by key() so that every entity is exposed under a distinct key.
 *
 * @var int
 */
private $iterator_position = 0;

/**
 * Returns the entity the reader is currently positioned at.
 *
 * Part of the Iterator interface; delegates to get_entity().
 *
 * NOTE(review): rewind() does nothing, so a plain foreach reaches this
 * method before next() has ever been called. Confirm get_entity() returns
 * an object in that state.
 *
 * @return object The current entity.
 */
public function current(): object {
	return $this->get_entity();
}

/**
 * Advances the reader to the next entity.
 *
 * The position counter only moves when next_entity() reports success,
 * i.e. returns anything other than false.
 */
public function next(): void {
	if ( false !== $this->next_entity() ) {
		++$this->iterator_position;
	}
}

/**
 * Returns the key of the current entity.
 *
 * This used to be a constant 0, which made every entity share the same key.
 * Key-preserving consumers such as iterator_to_array() then kept only the
 * last entity. The key is now the number of successful next() calls.
 *
 * @return int The key of the current entity.
 */
public function key(): int {
	return $this->iterator_position;
}

/**
 * Tells whether the reader may still have an entity to offer.
 *
 * @return bool False once next_entity() has run out of entities.
 */
public function valid(): bool {
	return ! $this->is_finished;
}

/**
 * Required by the Iterator interface; deliberately does nothing.
 *
 * The reader is not moved back to its first entity.
 */
public function rewind(): void {
	// noop
}
}
25 changes: 24 additions & 1 deletion packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@
*
* @since WP_VERSION
*/
class WP_WXR_Reader {
class WP_WXR_Reader implements Iterator {

/**
* The XML processor used to parse the WXR file.
Expand Down Expand Up @@ -863,4 +863,27 @@ private function after_entity() {
$this->last_opener_attributes = array();
$this->entity_byte_offset = null;
}

/**
 * Returns the entity the reader is currently positioned at.
 *
 * Part of the Iterator interface; delegates to get_entity().
 *
 * NOTE(review): rewind() does nothing, so a plain foreach reaches this
 * method before next() has ever been called. Confirm get_entity() returns
 * an object in that state.
 *
 * @return object The current entity.
 */
public function current(): object {
	return $this->get_entity();
}

/**
 * Result of the most recent next_entity() call made through next().
 *
 * Stays null until next() has been called at least once.
 */
private $last_next_result = null;

/**
 * Number of times next() has successfully advanced to a new entity.
 *
 * Reported by key() so that every entity is exposed under a distinct key.
 *
 * @var int
 */
private $iterator_position = 0;

/**
 * Advances the reader to the next entity and remembers the outcome.
 *
 * The position counter only moves when next_entity() reports success,
 * i.e. returns anything other than false.
 */
public function next(): void {
	// @TODO: Don't keep track of this. Just make sure the next_entity()
	// call will make the is_finished() true.
	$this->last_next_result = $this->next_entity();
	if ( false !== $this->last_next_result ) {
		++$this->iterator_position;
	}
}

/**
 * Returns the key of the current entity.
 *
 * This used to be a constant 0, which made every entity share the same key.
 * Key-preserving consumers such as iterator_to_array() then kept only the
 * last entity. The key is now the number of successful next() calls.
 *
 * @return int The key of the current entity.
 */
public function key(): int {
	return $this->iterator_position;
}

/**
 * Tells whether iteration may continue.
 *
 * Iteration stops as soon as the last next_entity() call returned false,
 * the reader reports being finished, or an error has been recorded.
 *
 * @return bool Whether the reader can still be iterated.
 */
public function valid(): bool {
	return false !== $this->last_next_result && ! $this->is_finished() && ! $this->get_last_error();
}

/**
 * Required by the Iterator interface; deliberately does nothing.
 *
 * The reader is not moved back to its first entity.
 */
public function rewind(): void {
	// noop
}
}

0 comments on commit 9c01981

Please sign in to comment.