diff --git a/packages/playground/data-liberation/plugin.php b/packages/playground/data-liberation/plugin.php index ecf88569fe..3a0f15f7a4 100644 --- a/packages/playground/data-liberation/plugin.php +++ b/packages/playground/data-liberation/plugin.php @@ -8,13 +8,13 @@ /** * Don't run KSES on the attribute values during the import. - * + * * Without this filter, WP_HTML_Tag_Processor::set_attribute() will * assume the value is a URL and run KSES on it, which will incorrectly * prefix relative paths with http://. - * + * * For example: - * + * * > $html = new WP_HTML_Tag_Processor( '' ); * > $html->next_tag(); * > $html->set_attribute( 'src', './_assets/log-errors.png' ); @@ -25,6 +25,41 @@ return []; }); +/** + * Development debug code to run the import manually. + * @TODO: Remove this in favor of a CLI command. + */ +add_action('init', function() { + return; + $wxr_path = __DIR__ . '/tests/fixtures/wxr-simple.xml'; + $importer = WP_Stream_Importer::create_for_wxr_file( + $wxr_path + ); + while($importer->next_step()) { + // ... + } + return; + $importer->next_step(); + $paused_importer_state = $importer->get_reentrancy_cursor(); + + echo "\n\n"; + echo "moving to importer2\n"; + echo "\n\n"; + + $importer2 = WP_Stream_Importer::create_for_wxr_file( + $wxr_path, + array(), + $paused_importer_state + ); + $importer2->next_step(); + $importer2->next_step(); + $importer2->next_step(); + // $importer2->next_step(); + // var_dump($importer2); + + die("YAY"); +}); + // Register admin menu add_action('admin_menu', function() { add_menu_page( @@ -86,7 +121,7 @@ function data_liberation_admin_page() { data_liberation_process_import(); echo ''; } - + ?>

Active import

- +

Import Content

- + @@ -175,7 +210,7 @@ function data_liberation_admin_page() { - + @@ -184,7 +219,7 @@ function data_liberation_admin_page() {

Upload a WordPress eXtended RSS (WXR) file

- + @@ -193,7 +228,7 @@ function data_liberation_admin_page() {

Enter the URL of a WXR file

- + @@ -210,7 +245,7 @@ function data_liberation_admin_page() {

Previous Imports

TODO: Show a table of previous imports.

- +
Import Type
WXR File
WXR URL
Markdown ZIP
@@ -329,7 +364,7 @@ function data_liberation_admin_page() { */ // if(is_wp_error(wp_schedule_event(time(), 'data_liberation_minute', 'data_liberation_process_import'))) { // wp_delete_attachment($attachment_id, true); - // // @TODO: More user friendly error message – maybe redirect back to the import screen and + // // @TODO: More user friendly error message – maybe redirect back to the import screen and // // show the error there. // wp_die('Failed to schedule import – the "data_liberation_minute" schedule may not be registered.'); // } @@ -353,20 +388,9 @@ function data_liberation_process_import() { function data_liberation_import_step($import) { $importer = data_liberation_create_importer($import); - // @TODO: Save the last importer state so we can resume it later if interrupted. - update_option('data_liberation_import_progress', [ - 'status' => 'Downloading static assets...', - 'current' => 0, - 'total' => 0 - ]); - $importer->frontload_assets(); - // @TODO: Keep track of multiple progress dimensions – posts, assets, categories, etc. - update_option('data_liberation_import_progress', [ - 'status' => 'Importing posts...', - 'current' => 0, - 'total' => 0 - ]); - $importer->import_entities(); + while($importer->next_step()) { + // ...Twiddle our thumbs... + } delete_option('data_liberation_active_import'); // @TODO: Do not echo things. Append to an import log where we can retrace the steps. // Also, store specific import events in the database so the user can react and @@ -382,25 +406,13 @@ function data_liberation_create_importer($import) { // @TODO: Save the error, report it to the user. return; } - $entity_iterator_factory = function() use ($wxr_path) { - $wxr = new WP_WXR_Reader(); - $wxr->connect_upstream(new WP_File_Reader($wxr_path)); - - return $wxr; - }; - return WP_Stream_Importer::create( - $entity_iterator_factory + return WP_Stream_Importer::create_for_wxr_file( + $wxr_path ); case 'wxr_url': - $wxr_url = $import['wxr_url']; - $entity_iterator_factory = function() use ($wxr_url) { - $wxr = new WP_WXR_Reader(); - $wxr->connect_upstream(new WP_Remote_File_Reader($wxr_url)); - return $wxr; - }; - return WP_Stream_Importer::create( - $entity_iterator_factory + return WP_Stream_Importer::create_for_wxr_url( + $import['wxr_url'] ); case 'markdown_zip': @@ -419,18 +431,12 @@ function data_liberation_create_importer($import) { } } $markdown_root = $temp_dir; - $entity_iterator_factory = function() use ($markdown_root) { - return new WP_Markdown_Directory_Tree_Reader( - $markdown_root, - 1000 - ); - }; - return WP_Markdown_Importer::create( - $entity_iterator_factory, [ + return WP_Markdown_Importer::create_for_markdown_directory( + $markdown_root, [ 'source_site_url' => 'file://' . $markdown_root, 'local_markdown_assets_root' => $markdown_root, 'local_markdown_assets_url_prefix' => '@site/', ] ); } -} \ No newline at end of file +} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php index efc80bffd9..6d8bd247db 100644 --- a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php +++ b/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php @@ -1,8 +1,8 @@ chunk_size = $chunk_size; } - /** - * Really these are just `tell()` and `seek()` operations, only the state is more - * involved than a simple offset. Hmm. - */ - public function pause(): array|bool { - return array( - 'offset_in_file' => $this->offset_in_file, - ); + public function tell(): int { + // Save the previous offset, not the current one. + // This way, after resuming, the next read will yield the same $output_bytes + // as we have now. + return $this->offset_in_file - $this->last_chunk_size; } - public function resume( $paused_state ): bool { + public function seek( $offset_in_file ): bool { + if ( ! is_int( $offset_in_file ) ) { + _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor to a non-integer offset.', '1.0.0' ); + return false; + } if ( $this->file_pointer ) { - _doing_it_wrong( __METHOD__, 'Cannot resume a file reader that is already initialized.', '1.0.0' ); + _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor on a file reader that is already initialized.', '1.0.0' ); return false; } - $this->offset_in_file = $paused_state['offset_in_file']; + $this->offset_in_file = $offset_in_file; + $this->last_chunk_size = 0; return true; } @@ -50,7 +53,8 @@ public function get_last_error(): string|null { } public function next_bytes(): bool { - $this->output_bytes = ''; + $this->output_bytes = ''; + $this->last_chunk_size = 0; if ( $this->last_error || $this->is_finished() ) { return false; } @@ -66,7 +70,8 @@ public function next_bytes(): bool { $this->state = static::STATE_FINISHED; return false; } - $this->offset_in_file += strlen( $bytes ); + $this->last_chunk_size = strlen( $bytes ); + $this->offset_in_file += $this->last_chunk_size; $this->output_bytes .= $bytes; return true; } diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php index e9e19ba0b1..819480d8fe 100644 --- a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php +++ b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php @@ -22,6 +22,19 @@ public function __construct( $url ) { $this->url = $url; } + public function tell(): int { + return $this->bytes_already_read + $this->skip_bytes; + } + + public function seek( $offset_in_file ): bool { + if ( $this->request ) { + _doing_it_wrong( __METHOD__, 'Cannot set a remote file reader cursor on a remote file reader that is already initialized.', '1.0.0' ); + return false; + } + $this->skip_bytes = $offset_in_file; + return true; + } + public function next_bytes(): bool { if ( null === $this->request ) { $this->request = new WordPress\AsyncHttp\Request( @@ -90,21 +103,6 @@ public function get_bytes(): string|null { return $this->current_chunk; } - public function pause(): array|bool { - return array( - 'offset_in_file' => $this->bytes_already_read + $this->skip_bytes, - ); - } - - public function resume( $paused_state ): bool { - if ( $this->request ) { - _doing_it_wrong( __METHOD__, 'Cannot resume a remote file reader that is already initialized.', '1.0.0' ); - return false; - } - $this->skip_bytes = $paused_state['offset_in_file']; - return true; - } - public function is_finished(): bool { return $this->is_finished; } diff --git a/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php index a54ce96eda..a6be3e74f0 100644 --- a/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php +++ b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php @@ -7,23 +7,25 @@ class WP_Attachment_Downloader { private $client; private $fps = array(); private $output_root; - private $partial_files = array(); - private $output_paths = array(); + private $output_paths = array(); + + private $current_event; + private $pending_events = array(); + private $downloads_so_far = 0; + private $enqueued_resource_id; public function __construct( $output_root ) { $this->client = new Client(); $this->output_root = $output_root; } - public function enqueue_if_not_exists( $url, $output_path = null ) { - if ( null === $output_path ) { - // Use the path from the URL. - $parsed_url = parse_url( $url ); - if ( false === $parsed_url ) { - return false; - } - $output_path = $parsed_url['path']; - } + public function has_pending_requests() { + return count( $this->client->get_active_requests() ) > 0; + } + + public function enqueue_if_not_exists( $url, $output_path ) { + $this->enqueued_resource_id = null; + $output_path = $this->output_root . '/' . ltrim( $output_path, '/' ); if ( file_exists( $output_path ) ) { // @TODO: Reconsider the return value. The enqueuing operation failed, @@ -42,18 +44,28 @@ public function enqueue_if_not_exists( $url, $output_path = null ) { return false; } + ++$this->downloads_so_far; switch ( $protocol ) { case 'file': $local_path = parse_url( $url, PHP_URL_PATH ); if ( false === $local_path ) { return false; } + // Just copy the file over. // @TODO: think through the chmod of the created file. - return copy( $local_path, $output_path ); + + $this->enqueued_resource_id = 'file:' . $this->downloads_so_far; + $success = copy( $local_path, $output_path ); + $this->pending_events[] = new WP_Attachment_Downloader_Event( + $this->enqueued_resource_id, + $success ? WP_Attachment_Downloader_Event::SUCCESS : WP_Attachment_Downloader_Event::FAILURE + ); + return true; case 'http': case 'https': $request = new Request( $url ); + $this->enqueued_resource_id = 'http:' . $request->id; $this->output_paths[ $request->id ] = $output_path; $this->client->enqueue( $request ); return true; @@ -61,10 +73,28 @@ public function enqueue_if_not_exists( $url, $output_path = null ) { return false; } + public function get_enqueued_resource_id() { + return $this->enqueued_resource_id; + } + public function queue_full() { return count( $this->client->get_active_requests() ) >= 10; } + public function get_event() { + return $this->current_event; + } + + public function next_event() { + $this->current_event = null; + if ( count( $this->pending_events ) === 0 ) { + return false; + } + + $this->current_event = array_shift( $this->pending_events ); + return true; + } + public function poll() { if ( ! $this->client->await_next_event() ) { return false; @@ -75,54 +105,63 @@ public function poll() { // Let's keep referring to the original request. $original_request_id = $request->original_request()->id; - switch ( $event ) { - case Client::EVENT_GOT_HEADERS: - if ( ! $request->is_redirected() ) { - $this->partial_files[ $original_request_id ] = $this->output_paths[ $original_request_id ] . '.partial'; - if ( file_exists( $this->partial_files[ $original_request_id ] ) ) { - unlink( $this->partial_files[ $original_request_id ] ); + while ( true ) { + switch ( $event ) { + case Client::EVENT_GOT_HEADERS: + if ( ! $request->is_redirected() ) { + if ( file_exists( $this->output_paths[ $original_request_id ] . '.partial' ) ) { + unlink( $this->output_paths[ $original_request_id ] . '.partial' ); + } + $this->fps[ $original_request_id ] = fopen( $this->output_paths[ $original_request_id ] . '.partial', 'wb' ); + if ( false === $this->fps[ $original_request_id ] ) { + // @TODO: Log an error. + } } - $this->fps[ $original_request_id ] = fopen( $this->output_paths[ $original_request_id ] . '.partial', 'wb' ); - if ( false === $this->fps[ $original_request_id ] ) { + break; + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $chunk = $this->client->get_response_body_chunk(); + if ( false === fwrite( $this->fps[ $original_request_id ], $chunk ) ) { // @TODO: Log an error. } - } - break; - case Client::EVENT_BODY_CHUNK_AVAILABLE: - $chunk = $this->client->get_response_body_chunk(); - if ( false === fwrite( $this->fps[ $original_request_id ], $chunk ) ) { - // @TODO: Log an error. - } - break; - case Client::EVENT_FAILED: - if ( isset( $this->fps[ $original_request_id ] ) ) { - fclose( $this->fps[ $original_request_id ] ); - } - if ( isset( $this->partial_files[ $original_request_id ] ) ) { - $partial_file = $this->output_root . '/' . $this->partial_files[ $original_request_id ] . '.partial'; - if ( file_exists( $partial_file ) ) { - unlink( $partial_file ); - } - } - unset( $this->output_paths[ $original_request_id ] ); - break; - case Client::EVENT_FINISHED: - if ( ! $request->is_redirected() ) { - // Only clean up if this was the last request in the chain. + break; + case Client::EVENT_FAILED: if ( isset( $this->fps[ $original_request_id ] ) ) { fclose( $this->fps[ $original_request_id ] ); } - if ( isset( $this->output_paths[ $original_request_id ] ) && isset( $this->partial_files[ $original_request_id ] ) ) { - if ( false === rename( - $this->partial_files[ $original_request_id ], - $this->output_paths[ $original_request_id ] - ) ) { - // @TODO: Log an error. + if ( isset( $this->output_paths[ $original_request_id ] ) ) { + $partial_file = $this->output_root . '/' . $this->output_paths[ $original_request_id ] . '.partial'; + if ( file_exists( $partial_file ) ) { + unlink( $partial_file ); } } - unset( $this->partial_files[ $original_request_id ] ); - } - break; + $this->pending_events[] = new WP_Attachment_Downloader_Event( + 'http:' . $original_request_id, + WP_Attachment_Downloader_Event::FAILURE + ); + unset( $this->output_paths[ $original_request_id ] ); + break; + case Client::EVENT_FINISHED: + if ( ! $request->is_redirected() ) { + // Only clean up if this was the last request in the chain. + if ( isset( $this->fps[ $original_request_id ] ) ) { + fclose( $this->fps[ $original_request_id ] ); + } + if ( isset( $this->output_paths[ $original_request_id ] ) ) { + if ( false === rename( + $this->output_paths[ $original_request_id ] . '.partial', + $this->output_paths[ $original_request_id ] + ) ) { + // @TODO: Log an error. + } + } + $this->pending_events[] = new WP_Attachment_Downloader_Event( + 'http:' . $original_request_id, + WP_Attachment_Downloader_Event::SUCCESS + ); + unset( $this->output_paths[ $original_request_id ] ); + } + break; + } } return true; diff --git a/packages/playground/data-liberation/src/import/WP_Attachment_Downloader_Event.php b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader_Event.php new file mode 100644 index 0000000000..b759ebb8fb --- /dev/null +++ b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader_Event.php @@ -0,0 +1,15 @@ +resource_id = $resource_id; + $this->type = $type; + } +} diff --git a/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php b/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php index f4551a35a0..e76e6d643f 100644 --- a/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php +++ b/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php @@ -2,12 +2,14 @@ class WP_Markdown_Importer extends WP_Stream_Importer { - public static function create( - $entity_iterator_factory, - $options = array() - ) { - $options = static::parse_options( $options ); - return new WP_Markdown_Importer( $entity_iterator_factory, $options ); + public static function create_for_markdown_directory( $markdown_directory, $options = array(), $cursor = null ) { + return static::create( + function ( $cursor = null ) use ( $markdown_directory ) { + return WP_Markdown_Importer::create( $markdown_directory, $cursor ); + }, + $options, + $cursor + ); } protected static function parse_options( $options ) { diff --git a/packages/playground/data-liberation/src/import/WP_Stream_Importer.php b/packages/playground/data-liberation/src/import/WP_Stream_Importer.php index 5258fb9cae..cc4ea76c30 100644 --- a/packages/playground/data-liberation/src/import/WP_Stream_Importer.php +++ b/packages/playground/data-liberation/src/import/WP_Stream_Importer.php @@ -1,4 +1,8 @@ tag. */ - protected $source_site_url; + private $source_site_url; private $entity_iterator_factory; /** * @param array|string|null $query { @@ -38,18 +43,90 @@ class WP_Stream_Importer { * after the import. E.g. http://127.0.0.1:9400/wp-content/uploads/ * } */ - protected $options; - protected $downloader; + private $options; + + const STAGE_INITIAL = '#initial'; + const STAGE_TOPOLOGICAL_SORT = '#topological_sort'; + const STAGE_FRONTLOAD_ASSETS = '#frontload_assets'; + const STAGE_IMPORT_ENTITIES = '#import_entities'; + const STAGE_FINISHED = '#finished'; + + /** + * The current state of the import process. + * @var string + */ + private $stage = self::STAGE_INITIAL; + + /** + * Iterator that streams entities to import. + */ + private $entity_iterator; + private $resume_at_entity; + /** + * A map of currently downloaded resources for each entity in + * the following format: + * + * [$entity_cursor => [$resource_id => true]] + * + * @var array> + */ + private $active_downloads = array(); + private $downloader; + + public static function create_for_wxr_file( $wxr_path, $options = array(), $cursor = null ) { + return static::create( + function ( $cursor = null ) use ( $wxr_path ) { + return WP_WXR_Reader::create( new WP_File_Reader( $wxr_path ), $cursor ); + }, + $options, + $cursor + ); + } + + public static function create_for_wxr_url( $wxr_url, $options = array(), $cursor = null ) { + return static::create( + function ( $cursor = null ) use ( $wxr_url ) { + return WP_WXR_Reader::create( new WP_Remote_File_Reader( $wxr_url ), $cursor ); + }, + $options, + $cursor + ); + } public static function create( $entity_iterator_factory, - $options = array() + $options = array(), + $cursor = null ) { - $options = static::parse_options( $options ); - return new WP_Stream_Importer( $entity_iterator_factory, $options ); + $options = static::parse_options( $options ); + $importer = new WP_Stream_Importer( $entity_iterator_factory, $options ); + if ( null !== $cursor && true !== $importer->initialize_from_cursor( $cursor ) ) { + return false; + } + return $importer; + } + + public function get_reentrancy_cursor() { + return json_encode( + array( + 'state' => $this->stage, + 'resume_at_entity' => $this->resume_at_entity, + ) + ); + } + + private function initialize_from_cursor( $cursor ) { + $cursor = json_decode( $cursor, true ); + if ( ! is_array( $cursor ) ) { + _doing_it_wrong( __METHOD__, 'Cannot resume an importer with a non-array cursor.', '1.0.0' ); + return false; + } + $this->stage = $cursor['state']; + $this->resume_at_entity = $cursor['resume_at_entity']; + return true; } - protected static function parse_options( $options ) { + private static function parse_options( $options ) { if ( ! isset( $options['new_site_url'] ) ) { $options['new_site_url'] = get_site_url(); } @@ -69,7 +146,7 @@ protected static function parse_options( $options ) { return $options; } - protected function __construct( + private function __construct( $entity_iterator_factory, $options = array() ) { @@ -80,6 +157,72 @@ protected function __construct( } } + /** + * The WordPress entity importer instance. + * @TODO: Consider inlining the importer code into this class. + * + * @var WP_Entity_Importer + */ + private $importer; + + public function next_step() { + switch ( $this->stage ) { + case self::STAGE_INITIAL: + $this->stage = self::STAGE_TOPOLOGICAL_SORT; + return true; + case self::STAGE_TOPOLOGICAL_SORT: + // @TODO: Topologically sort the entities. + $this->stage = self::STAGE_FRONTLOAD_ASSETS; + return true; + case self::STAGE_FRONTLOAD_ASSETS: + $this->next_frontloading_step(); + return true; + case self::STAGE_IMPORT_ENTITIES: + $this->import_next_entity(); + return true; + case self::STAGE_FINISHED: + return false; + } + } + + /** + * Advance the cursor to the oldest finished download. For example: + * + * * We've started downloading files A, B, C, and D in this order. + * * D is the first to finish. We don't do anything yet. + * * A finishes next. We advance the cursor to A. + * * C finishes next. We don't do anything. + * * Then we pause. + * + * When we resume, we'll start where we left off, which is after A. The + * downloader will enqueue B for download and will skip C and D since + * the relevant files already exist in the filesystem. + */ + private function frontloading_advance_reentrancy_cursor() { + while ( $this->downloader->next_event() ) { + $event = $this->downloader->get_event(); + switch ( $event->type ) { + case WP_Attachment_Downloader_Event::SUCCESS: + case WP_Attachment_Downloader_Event::FAILURE: + foreach ( array_keys( $this->active_downloads ) as $entity_cursor ) { + unset( $this->active_downloads[ $entity_cursor ][ $event->resource_id ] ); + } + break; + } + } + + while ( count( $this->active_downloads ) > 0 ) { + $oldest_download_cursor = key( $this->active_downloads ); + $downloads_completed = empty( $this->active_downloads[ $oldest_download_cursor ] ); + if ( ! $downloads_completed ) { + break; + } + // Advance the cursor to the next entity. + $this->resume_at_entity = $oldest_download_cursor; + unset( $this->active_downloads[ $oldest_download_cursor ] ); + } + } + /** * Downloads all the assets referenced in the imported entities. * @@ -87,46 +230,91 @@ protected function __construct( * before import_entities() so that every inserted post already has * all its attachments downloaded. */ - public function frontload_assets() { - $factory = $this->entity_iterator_factory; - $entities = $factory(); - $this->downloader = new WP_Attachment_Downloader( $this->options['uploads_path'] ); - foreach ( $entities as $entity ) { - if ( $this->downloader->queue_full() ) { - $this->downloader->poll(); - continue; - } + private function next_frontloading_step() { + if ( null === $this->entity_iterator ) { + $this->entity_iterator = $this->create_entity_iterator(); + $this->downloader = new WP_Attachment_Downloader( $this->options ); + } - $data = $entity->get_data(); - if ( 'site_option' === $entity->get_type() && $data['option_name'] === 'home' ) { - $this->source_site_url = $data['option_value']; - } elseif ( 'post' === $entity->get_type() ) { - if ( isset( $data['post_type'] ) && $data['post_type'] === 'attachment' ) { - // Download media attachment entities. - $this->enqueue_attachment_download( - $data['attachment_url'] - ); - } elseif ( isset( $data['post_content'] ) ) { - $this->enqueue_attachments_referenced_in_post( - $data - ); - } + $this->frontloading_advance_reentrancy_cursor(); + + // We're done if all the entities are processed and all the downloads are finished. + if ( ! $this->entity_iterator->valid() && ! $this->downloader->has_pending_requests() ) { + // This is an assertion to make double sure we're emptying the state queue. + if ( ! empty( $this->active_downloads ) ) { + _doing_it_wrong( __METHOD__, 'Frontloading queue is not empty.', '1.0' ); } + $this->stage = self::STAGE_IMPORT_ENTITIES; + $this->downloader = null; + $this->active_downloads = array(); + $this->entity_iterator = null; + $this->resume_at_entity = null; + return false; } - while ( $this->downloader->poll() ) { - // Twiddle our thumbs as the downloader processes the requests... + // Poll the bytes between scheduling new downloads. + $only_downloader_pending = ! $this->entity_iterator->valid() && $this->downloader->has_pending_requests(); + if ( $this->downloader->queue_full() || $only_downloader_pending ) { /** * @TODO: - * * Consider inlining the downloader code into this class. * * Process and store failures. * E.g. what if the attachment is not found? Error out? Ignore? In a UI-based * importer scenario, this is the time to log a failure to let the user * fix it later on. In a CLI-based Blueprint step importer scenario, we * might want to provide an "image not found" placeholder OR ignore the * failure. + * + * @TODO: Update the download progress: + * * After every downloaded file. + * * For large files, every time a full megabyte is downloaded above 10MB. */ + return $this->downloader->poll(); + } + + /** + * Identify the static assets referenced in the current entity + * and enqueue them for download. + */ + $entity = $this->entity_iterator->current(); + $cursor = $this->entity_iterator->get_reentrancy_cursor(); + $this->active_downloads[ $cursor ] = array(); + + $data = $entity->get_data(); + switch ( $entity->get_type() ) { + case 'site_option': + if ( $data['option_name'] === 'home' ) { + $this->source_site_url = $data['option_value']; + } + break; + case 'post': + if ( isset( $data['post_type'] ) && $data['post_type'] === 'attachment' ) { + $this->enqueue_attachment_download( $data['attachment_url'], null ); + } elseif ( isset( $data['post_content'] ) ) { + $post = $data; + $p = new WP_Block_Markup_Url_Processor( $post['post_content'], $this->source_site_url ); + while ( $p->next_url() ) { + if ( ! $this->url_processor_matched_asset_url( $p ) ) { + continue; + } + $this->enqueue_attachment_download( + $p->get_raw_url(), + $post['source_path'] ?? $post['slug'] ?? null + ); + } + } + break; } + + /** + * @TODO: Update the progress information. + * @TODO: Save the freshly requested URLs to the cursor. + */ + + // Move on to the next entity. + $this->entity_iterator->next(); + + $this->frontloading_advance_reentrancy_cursor(); + return true; } /** @@ -136,55 +324,87 @@ public function frontload_assets() { * large datasets, but maybe it could be a choice for * the API consumer? */ - public function import_entities() { - $importer = new WP_Entity_Importer(); - $factory = $this->entity_iterator_factory; - $entities = $factory(); - foreach ( $entities as $entity ) { - $attachments = array(); - // Rewrite the URLs in the post. - switch ( $entity->get_type() ) { - case 'post': - $data = $entity->get_data(); - foreach ( array( 'guid', 'post_content', 'post_excerpt' ) as $key ) { - if ( ! isset( $data[ $key ] ) ) { - continue; - } - $p = new WP_Block_Markup_Url_Processor( $data[ $key ], $this->source_site_url ); - while ( $p->next_url() ) { - if ( $this->url_processor_matched_asset_url( $p ) ) { - $filename = $this->new_asset_filename( $p->get_raw_url() ); - $new_asset_url = $this->options['uploads_url'] . '/' . $filename; - $p->replace_base_url( WP_URL::parse( $new_asset_url ) ); - $attachments[] = $new_asset_url; - /** - * @TODO: How would we know a specific image block refers to a specific - * attachment? We need to cross-correlate that to rewrite the URL. - * The image block could have query parameters, too, but presumably the - * path would be the same at least? What if the same file is referred - * to by two different URLs? e.g. assets.site.com and site.com/assets/ ? - * A few ideas: GUID, block attributes, fuzzy matching. Maybe a configurable - * strategy? And the API consumer would make the decision? - */ - } elseif ( $this->source_site_url && - $p->get_parsed_url() && - url_matches( $p->get_parsed_url(), $this->source_site_url ) - ) { - $p->replace_base_url( WP_URL::parse( $this->options['new_site_url'] ) ); - } else { - // Ignore other URLs. - } + private function import_next_entity() { + if ( null === $this->entity_iterator ) { + $this->entity_iterator = $this->create_entity_iterator(); + $this->importer = new WP_Entity_Importer(); + } + + if ( ! $this->entity_iterator->valid() ) { + // We're done. + $this->stage = self::STAGE_FINISHED; + $this->entity_iterator = null; + $this->importer = null; + return; + } + + $entity = $this->entity_iterator->current(); + $attachments = array(); + // Rewrite the URLs in the post. + switch ( $entity->get_type() ) { + case 'post': + $data = $entity->get_data(); + foreach ( array( 'guid', 'post_content', 'post_excerpt' ) as $key ) { + if ( ! isset( $data[ $key ] ) ) { + continue; + } + $p = new WP_Block_Markup_Url_Processor( $data[ $key ], $this->source_site_url ); + while ( $p->next_url() ) { + if ( $this->url_processor_matched_asset_url( $p ) ) { + $filename = $this->new_asset_filename( $p->get_raw_url() ); + $new_asset_url = $this->options['uploads_url'] . '/' . $filename; + $p->replace_base_url( WP_URL::parse( $new_asset_url ) ); + $attachments[] = $new_asset_url; + /** + * @TODO: How would we know a specific image block refers to a specific + * attachment? We need to cross-correlate that to rewrite the URL. + * The image block could have query parameters, too, but presumably the + * path would be the same at least? What if the same file is referred + * to by two different URLs? e.g. assets.site.com and site.com/assets/ ? + * A few ideas: GUID, block attributes, fuzzy matching. Maybe a configurable + * strategy? And the API consumer would make the decision? + */ + } elseif ( $this->source_site_url && + $p->get_parsed_url() && + url_matches( $p->get_parsed_url(), $this->source_site_url ) + ) { + $p->replace_base_url( WP_URL::parse( $this->options['new_site_url'] ) ); + } else { + // Ignore other URLs. } - $data[ $key ] = $p->get_updated_html(); } - $entity->set_data( $data ); - break; - } - $post_id = $importer->import_entity( $entity ); - foreach ( $attachments as $filepath ) { - $importer->import_attachment( $filepath, $post_id ); - } + $data[ $key ] = $p->get_updated_html(); + } + $entity->set_data( $data ); + break; + } + + // @TODO: Monitor failures. + $post_id = $this->importer->import_entity( $entity ); + foreach ( $attachments as $filepath ) { + // @TODO: Monitor failures. + $this->importer->import_attachment( $filepath, $post_id ); } + + /** + * @TODO: Update the progress information. + */ + $this->resume_at_entity = $this->entity_iterator->get_reentrancy_cursor(); + $this->entity_iterator->next(); + } + + private function enqueue_attachment_download( string $raw_url, $context_path = null ) { + $url = $this->rewrite_attachment_url( $raw_url, $context_path ); + $asset_filename = $this->new_asset_filename( $raw_url ); + $output_path = $this->options['uploads_path'] . '/' . ltrim( $asset_filename, '/' ); + + $enqueued = $this->downloader->enqueue_if_not_exists( $url, $output_path ); + if ( $enqueued ) { + $resource_id = $this->downloader->get_last_enqueued_resource_id(); + $entity_cursor = $this->entity_iterator->get_reentrancy_cursor(); + $this->active_downloads[ $entity_cursor ][ $resource_id ] = true; + } + return $enqueued; } /** @@ -234,53 +454,7 @@ private function new_asset_filename( string $raw_asset_url ) { return $filename; } - /** - * Infers and enqueues the attachments URLs from the post content. - * - * Why not just emit the attachment URLs from WP_Markdown_Directory_Tree_Reader - * as other entities? - * - * Whether it's Markdown, static HTML, or another static file format, - * we'll need to recover the attachment URLs from the We can either - * have a separate pipeline step for that, or burden every format - * reader with reimplementing the same logic. So let's just keep it - * separated. - */ - protected function enqueue_attachments_referenced_in_post( $post ) { - $p = new WP_Block_Markup_Url_Processor( $post['post_content'], $this->source_site_url ); - while ( $p->next_url() ) { - if ( ! $this->url_processor_matched_asset_url( $p ) ) { - continue; - } - - $enqueued = $this->enqueue_attachment_download( - $p->get_raw_url(), - $post['source_path'] ?? $post['slug'] ?? null - ); - if ( false === $enqueued ) { - continue; - } - } - } - - protected function enqueue_attachment_download( string $raw_url, $context_path = null ) { - $new_filename = $this->new_asset_filename( $raw_url ); - $downloadable_url = $this->rewrite_attachment_url( $raw_url, $context_path ); - $success = $this->downloader->enqueue_if_not_exists( - $downloadable_url, - $new_filename - ); - if ( false === $success ) { - // @TODO: Save the failure info somewhere so the user can review it later - // and either retry or provide their own asset. - // Meanwhile, we may either halt the content import, or provide a placeholder - // asset. - _doing_it_wrong( __METHOD__, "Failed to fetch attachment '$raw_url' from '$downloadable_url'", '__WP_VERSION__' ); - } - return $success; - } - - protected function rewrite_attachment_url( string $raw_url, $context_path = null ) { + private function rewrite_attachment_url( string $raw_url, $context_path = null ) { if ( WP_URL::can_parse( $raw_url ) ) { // Absolute URL, nothing to do. return $raw_url; @@ -303,11 +477,16 @@ protected function rewrite_attachment_url( string $raw_url, $context_path = null * @TODO: How can we process the videos? * @TODO: What other asset types are there? */ - protected function url_processor_matched_asset_url( WP_Block_Markup_Url_Processor $p ) { + private function url_processor_matched_asset_url( WP_Block_Markup_Url_Processor $p ) { return ( $p->get_tag() === 'IMG' && $p->get_inspected_attribute_name() === 'src' && ( ! $this->source_site_url || url_matches( $p->get_parsed_url(), $this->source_site_url ) ) ); } + + private function create_entity_iterator() { + $factory = $this->entity_iterator_factory; + return $factory( $this->resume_at_entity ); + } } diff --git a/packages/playground/data-liberation/src/markdown-api/WP_Markdown_Directory_Tree_Reader.php b/packages/playground/data-liberation/src/markdown-api/WP_Markdown_Directory_Tree_Reader.php index e69571b3af..da7514eb89 100644 --- a/packages/playground/data-liberation/src/markdown-api/WP_Markdown_Directory_Tree_Reader.php +++ b/packages/playground/data-liberation/src/markdown-api/WP_Markdown_Directory_Tree_Reader.php @@ -4,6 +4,9 @@ * * This exploration accompanies the WXR reader to inform a generic * data importing pipeline that's not specific to a single input format. + * + * @TODO: Support multiple data sources – filesystem directory tree, zip file, ... + * @TODO: Expose a cursor to allow resuming from where we left off. */ class WP_Markdown_Directory_Tree_Reader implements Iterator { @@ -14,7 +17,8 @@ class WP_Markdown_Directory_Tree_Reader implements Iterator { private $pending_files = array(); private $parent_ids = array(); private $next_post_id; - private $is_finished = false; + private $is_finished = false; + private $entities_read_so_far = 0; public function __construct( $root_dir, $first_post_id ) { $this->file_visitor = new WP_File_Visitor( realpath( $root_dir ) ); @@ -38,6 +42,7 @@ public function next_entity() { } $post_id = $this->next_post_id; ++$this->next_post_id; + ++$this->entities_read_so_far; $this->entity = $this->markdown_to_post_entity( array( 'markdown' => $markdown, @@ -66,6 +71,7 @@ public function next_entity() { ) ); ++$this->next_post_id; + ++$this->entities_read_so_far; return true; } @@ -264,7 +270,7 @@ public function next(): void { } public function key(): int { - return 0; + return $this->entities_read_so_far - 1; } public function valid(): bool { diff --git a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php b/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php index c88ce0107d..c37d952714 100644 --- a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php +++ b/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php @@ -118,9 +118,7 @@ * * @TODO: * - * - Save parser state after each entity or every `n` entities to speed it up. Then also save the `n` - * for a quick rewind after resuming. - * - Resume parsing from saved state. + * - Revisit the need to implement the Iterator interface. * * @since WP_VERSION */ @@ -175,6 +173,14 @@ class WP_WXR_Reader implements Iterator { */ private $entity_finished = false; + /** + * The number of entities read so far. + * + * @since WP_VERSION + * @var int + */ + private $entities_read_so_far = 0; + /** * The attributes from the last opening tag. * @@ -334,34 +340,42 @@ class WP_WXR_Reader implements Iterator { ), ); - public function pause() { - $upstream_state = $this->upstream ? $this->upstream->pause() : null; - if ( $upstream_state ) { - // @TODO: Don't assume this specific key name. Find a way to generalize - // this to, e.g., remote HTTP byte sources. - $upstream_state['offset_in_file'] = $this->entity_byte_offset; + public static function create( WP_Byte_Reader $upstream = null, $cursor = null ) { + $xml_cursor = null; + if ( null !== $cursor ) { + $cursor = json_decode( $cursor, true ); + if ( false === $cursor ) { + _doing_it_wrong( + __METHOD__, + 'Invalid cursor provided for WP_WXR_Reader::create().', + null + ); + return false; + } + $xml_cursor = $cursor['xml']; } - return array( - 'xml' => $this->xml->pause(), - 'upstream' => $upstream_state, - 'last_post_id' => $this->last_post_id, - 'last_comment_id' => $this->last_comment_id, - ); - } - public function resume( $paused_state ) { - // @TODO: Validate the paused state. - if ( $paused_state['upstream'] ) { - if ( ! $this->upstream ) { - // @TODO: _doing_it_wrong() - return false; + $xml = WP_XML_Processor::create_for_streaming( '', $xml_cursor ); + $reader = new WP_WXR_Reader( $xml ); + if ( null !== $cursor ) { + $reader->last_post_id = $cursor['last_post_id']; + $reader->last_comment_id = $cursor['last_comment_id']; + } + if ( null !== $upstream ) { + $reader->connect_upstream( $upstream ); + if ( null !== $cursor ) { + if ( ! isset( $cursor['upstream'] ) ) { + _doing_it_wrong( + __METHOD__, + 'Invalid cursor provided for WP_WXR_Reader::create(). The upstream offset was missing.', + null + ); + return false; + } + $upstream->seek( $cursor['upstream'] ); } - $this->upstream->resume( $paused_state['upstream'] ); } - $this->xml->resume( $paused_state['xml'] ); - $this->last_post_id = $paused_state['last_post_id']; - $this->last_comment_id = $paused_state['last_comment_id']; - $this->next_entity(); + return $reader; } /** @@ -371,8 +385,30 @@ public function resume( $paused_state ) { * * @param WP_XML_Processor $xml The XML processor to use. */ - public function __construct() { - $this->xml = WP_XML_Processor::create_for_streaming(); + protected function __construct( WP_XML_Processor $xml ) { + $this->xml = $xml; + } + + public function get_reentrancy_cursor() { + /** + * @TODO: Instead of adjusting the XML cursor internals, adjust the get_reentrancy_cursor() + * call to support $bookmark_name, e.g. $this->xml->get_reentrancy_cursor( 'last_entity' ); + * If the cursor internal data was a part of every bookmark, this would have worked + * even after evicting the actual bytes where $last_entity is stored. + */ + $xml_cursor = $this->xml->get_reentrancy_cursor(); + $xml_cursor = json_decode( base64_decode( $xml_cursor ), true ); + $xml_cursor['upstream_bytes_forgotten'] = $this->entity_byte_offset; + $xml_cursor = base64_encode( json_encode( $xml_cursor ) ); + return json_encode( + array( + 'xml' => $xml_cursor, + // WP_Byte_Reader cursors are always integer byte offsets in the stream. + 'upstream' => $this->entity_byte_offset, + 'last_post_id' => $this->last_post_id, + 'last_comment_id' => $this->last_comment_id, + ) + ); } /** @@ -603,7 +639,7 @@ private function read_next_entity() { if ( $this->xml->is_tag_opener() ) { $this->set_entity_tag( $tag ); if ( array_key_exists( $this->xml->get_tag(), static::KNOWN_ENITIES ) ) { - $this->entity_byte_offset = $this->get_current_byte_offset(); + $this->entity_byte_offset = $this->xml->get_token_byte_offset_in_the_input_stream(); } } continue; @@ -659,7 +695,7 @@ private function read_next_entity() { array_key_exists( $this->xml->get_tag(), static::KNOWN_SITE_OPTIONS ) ); if ( $is_site_option_opener ) { - $this->entity_byte_offset = $this->get_current_byte_offset(); + $this->entity_byte_offset = $this->xml->get_token_byte_offset_in_the_input_stream(); } continue; } @@ -800,18 +836,6 @@ private function pull_upstream_bytes() { return true; } - /** - * Returns current's XML token offset in the input stream. - * - * @since WP_VERSION - * - * @return int The current byte offset. - */ - private function get_current_byte_offset() { - $paused_xml_state = $this->xml->pause(); - return $paused_xml_state['token_byte_offset_in_the_input_stream']; - } - /** * Marks the current entity as emitted and updates tracking variables. * @@ -834,6 +858,7 @@ private function emit_entity() { $this->entity_data['taxonomy'] = 'category'; } $this->entity_finished = true; + ++$this->entities_read_so_far; } /** @@ -879,8 +904,8 @@ public function next(): void { $this->last_next_result = $this->next_entity(); } - public function key(): int { - return 0; + public function key(): string { + return $this->get_reentrancy_cursor(); } public function valid(): bool { @@ -888,6 +913,10 @@ public function valid(): bool { } public function rewind(): void { - // noop + _doing_it_wrong( + __METHOD__, + 'WP_WXR_Reader does not support rewinding.', + null + ); } } diff --git a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php index 6055854192..9dce95e3fc 100644 --- a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php +++ b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php @@ -22,8 +22,7 @@ * starting with 1.0, however, because most that's what most WXR * files declare. * - * @TODO: Scrutinize pause() and resume() methods. Can we avoid exposing string - * indices and other internal state? + * @TODO: Include the cursor string in internal bookmarks and use it for seeking. * * @TODO: Track specific error states, expose informative messages, line * numbers, indexes, and other debugging info. @@ -387,7 +386,7 @@ class WP_XML_Processor { protected $expecting_more_input = true; /** - * How many bytes from the original XML document have been read and parsed. + * How many bytes from the current XML chunk have been read and parsed. * * This value points to the latest byte offset in the input document which * has been already parsed. It is the internal cursor for the Tag Processor @@ -405,7 +404,7 @@ class WP_XML_Processor { * @since WP_VERSION * @var int */ - public $bytes_already_forgotten = 0; + public $upstream_bytes_forgotten = 0; /** * Byte offset in input document where current token starts. @@ -647,54 +646,90 @@ class WP_XML_Processor { /** * */ - public static function create_from_string( $xml, $known_definite_encoding = 'UTF-8' ) { - if ( 'UTF-8' !== $known_definite_encoding ) { - return null; + public static function create_from_string( $xml, $cursor = null, $known_definite_encoding = 'UTF-8' ) { + $processor = static::create_for_streaming( $xml, $cursor, $known_definite_encoding ); + if ( null === $processor ) { + return false; } - - $processor = new WP_XML_Processor( $xml, self::CONSTRUCTOR_UNLOCK_CODE ); $processor->input_finished(); return $processor; } - public static function create_for_streaming( $xml = '', $known_definite_encoding = 'UTF-8' ) { + public static function create_for_streaming( $xml = '', $cursor = null, $known_definite_encoding = 'UTF-8' ) { if ( 'UTF-8' !== $known_definite_encoding ) { - return null; + return false; } - return new WP_XML_Processor( $xml, self::CONSTRUCTOR_UNLOCK_CODE ); + $processor = new WP_XML_Processor( $xml, self::CONSTRUCTOR_UNLOCK_CODE ); + if ( null !== $cursor && true !== $processor->initialize_from_cursor( $cursor ) ) { + return false; + } + return $processor; } /** - * Pauses the processor and returns an array of the information needed to resume. + * Returns a re-entrancy cursor – it's a string that can instruct a new XML + * Processor instance to continue parsing from the current location in the + * document. * - * @TODO: - * – What to do with bookmarks when pausing? - * – Consider including all the below information in internal bookmarks. Consider using a logic - * similar to resume() in seek(). - * – Consider a WP_XML_Processor_Paused_State or a WP_XML_Processor_Bookmark class. - * – Should we flush the enqueued lexical updates first? - */ - public function pause() { - return array( - 'token_byte_offset_in_the_input_stream' => $this->bytes_already_forgotten + $this->token_starts_at, - 'bytes_already_forgotten' => $this->bytes_already_forgotten, - 'parser_context' => $this->parser_context, - 'stack_of_open_elements' => $this->stack_of_open_elements, - 'expecting_more_input' => $this->expecting_more_input, + * The only stable part of this API is the return type of string. The consumer + * of this method MUST NOT assume any specific structure of the returned + * string. It will change without a warning between WordPress releases. + * + * This is not a tell() API. No XML Processor method will accept the cursor + * to move to another location. The only way to use this cursor is creating + * a new XML Processor instance. If you need to move around the document, use + * `set_bookmark()` and `seek()`. + */ + public function get_reentrancy_cursor() { + return base64_encode( + json_encode( + array( + 'upstream_bytes_forgotten' => $this->upstream_bytes_forgotten, + 'parser_context' => $this->parser_context, + 'stack_of_open_elements' => $this->stack_of_open_elements, + 'expecting_more_input' => $this->expecting_more_input, + ) + ) ); } /** - * @TODO: - * – Validate the paused state, return false if it's invalid. + * Returns the byte offset in the input stream where the current token starts. + * + * You should probably not use this method. + * + * It's only exists to allow resuming the input stream at the same offset where + * the XML parsing was finished. It will never expose any attribute's byte + * offset and no method in the XML processor API will ever accept the byte offset + * to move to another location. If you need to move around the document, use + * `set_bookmark()` and `seek()` instead. */ - public function resume( $paused_state ) { - $this->bytes_already_parsed = 0; - $this->bytes_already_forgotten = $paused_state['bytes_already_forgotten']; - $this->stack_of_open_elements = $paused_state['stack_of_open_elements']; - $this->parser_context = $paused_state['parser_context']; - $this->expecting_more_input = $paused_state['expecting_more_input']; - $this->next_token(); + public function get_token_byte_offset_in_the_input_stream() { + return $this->token_starts_at + $this->upstream_bytes_forgotten; + } + + protected function initialize_from_cursor( $cursor ) { + if ( ! is_string( $cursor ) ) { + _doing_it_wrong( __METHOD__, 'Cursor must be a JSON-encoded string.', '1.0.0' ); + return false; + } + $cursor = base64_decode( $cursor ); + if ( false === $cursor ) { + _doing_it_wrong( __METHOD__, 'Invalid cursor provided to initialize_from_cursor().', '1.0.0' ); + return false; + } + $cursor = json_decode( $cursor, true ); + if ( false === $cursor ) { + _doing_it_wrong( __METHOD__, 'Invalid cursor provided to initialize_from_cursor().', '1.0.0' ); + return false; + } + // Assume the input stream will start from the last known byte offset. + $this->bytes_already_parsed = 0; + $this->upstream_bytes_forgotten = $cursor['upstream_bytes_forgotten']; + $this->stack_of_open_elements = $cursor['stack_of_open_elements']; + $this->parser_context = $cursor['parser_context']; + $this->expecting_more_input = $cursor['expecting_more_input']; + return true; } /** @@ -796,7 +831,7 @@ public function flush_processed_xml() { if ( null !== $this->text_starts_at ) { $this->text_starts_at -= $unreferenced_bytes; } - $this->bytes_already_forgotten += $unreferenced_bytes; + $this->upstream_bytes_forgotten += $unreferenced_bytes; return $flushed_bytes; } @@ -1560,7 +1595,7 @@ private function parse_next_tag() { */ if ( 0 === $at && - 0 === $this->bytes_already_forgotten && + 0 === $this->upstream_bytes_forgotten && ! $this->is_closing_tag && '?' === $xml[ $at + 1 ] && 'x' === $xml[ $at + 2 ] && @@ -3069,7 +3104,15 @@ private function step_in_element( $node_to_process = self::PROCESS_NEXT_NODE ) { return true; default: $this->last_error = self::ERROR_SYNTAX; - _doing_it_wrong( __METHOD__, 'Unexpected token type in element stage.', 'WP_VERSION' ); + _doing_it_wrong( + __METHOD__, + sprintf( + // translators: %1$s is the unexpected token type. + __( 'Unexpected token type "%1$s" in element stage.', 'data-liberation' ), + $this->get_token_type() + ), + 'WP_VERSION' + ); return false; } } diff --git a/packages/playground/data-liberation/tests/WPWXRReaderTests.php b/packages/playground/data-liberation/tests/WPWXRReaderTests.php index 2b08768080..881749cf81 100644 --- a/packages/playground/data-liberation/tests/WPWXRReaderTests.php +++ b/packages/playground/data-liberation/tests/WPWXRReaderTests.php @@ -8,7 +8,7 @@ class WPWXRReaderTests extends TestCase { * @dataProvider preexisting_wxr_files_provider */ public function test_does_not_crash_when_parsing_preexisting_wxr_files_as_string($path, $expected_entitys) { - $wxr = new WP_WXR_Reader(); + $wxr = WP_WXR_Reader::create(); $wxr->append_bytes(file_get_contents($path)); $wxr->input_finished(); @@ -25,7 +25,7 @@ public function test_does_not_crash_when_parsing_preexisting_wxr_files_as_string */ public function test_does_not_crash_when_parsing_preexisting_wxr_files_as_stream($path, $expected_entitys) { $stream = fopen($path, 'r'); - $wxr = new WP_WXR_Reader(); + $wxr = WP_WXR_Reader::create(); $found_entities = 0; while(true) { $chunk = fread($stream, 100); @@ -64,7 +64,7 @@ public function preexisting_wxr_files_provider() { public function test_simple_wxr() { - $importer = new WP_WXR_Reader(); + $importer = WP_WXR_Reader::create(); $importer->append_bytes(file_get_contents(__DIR__ . '/fixtures/wxr-simple.xml')); $importer->input_finished(); $this->assertTrue( $importer->next_entity() ); @@ -182,7 +182,7 @@ public function test_simple_wxr() { } public function test_attachments() { - $importer = new WP_WXR_Reader(); + $importer = WP_WXR_Reader::create(); $importer->append_bytes(<< @@ -265,7 +265,7 @@ public function test_attachments() { } public function test_terms() { - $importer = new WP_WXR_Reader(); + $importer = WP_WXR_Reader::create(); $importer->append_bytes(<< @@ -300,7 +300,7 @@ public function test_terms() { } public function test_category() { - $importer = new WP_WXR_Reader(); + $importer = WP_WXR_Reader::create(); $importer->append_bytes(<< @@ -331,7 +331,7 @@ public function test_category() { } public function test_tag_string() { - $importer = new WP_WXR_Reader(); + $importer = WP_WXR_Reader::create(); $importer->append_bytes(<< @@ -379,7 +379,7 @@ public function test_tag_streaming() { XML; $chunks = str_split($wxr, 10); - $wxr = new WP_WXR_Reader(); + $wxr = WP_WXR_Reader::create(); while(true) { if(true === $wxr->next_entity()) { break; @@ -411,7 +411,7 @@ public function test_tag_streaming() { } public function test_parse_comment() { - $wxr = new WP_WXR_Reader(); + $wxr = WP_WXR_Reader::create(); $wxr->append_bytes(<< @@ -494,7 +494,7 @@ public function test_parse_comment() { } public function test_retains_last_ids() { - $wxr = new WP_WXR_Reader(); + $wxr = WP_WXR_Reader::create(); $wxr->append_bytes(<< diff --git a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php index faee0c31a9..9a48413631 100644 --- a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php +++ b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php @@ -1735,13 +1735,15 @@ public function test_pause_and_resume() { $processor->next_tag(); $processor->next_tag(); $this->assertEquals( 'first_child', $processor->get_tag(), 'Did not find a tag.' ); - $paused_state = $processor->pause(); - $this->assertEquals( 10, $paused_state['token_byte_offset_in_the_input_stream'], 'Wrong position in the input stream exported.' ); + + $entity_offset = $processor->get_token_byte_offset_in_the_input_stream(); + $cursor = $processor->get_reentrancy_cursor(); $resumed = WP_XML_Processor::create_for_streaming( - substr( $xml, $paused_state['token_byte_offset_in_the_input_stream'] ) + substr( $xml, $entity_offset ), + $cursor ); - $resumed->resume( $paused_state ); + $resumed->next_tag(); $this->assertEquals( 'first_child', $resumed->get_tag(), 'Did not find a tag.' ); $resumed->next_token(); $this->assertEquals( 'Hello there', $resumed->get_modifiable_text(), 'Did not find the expected text.' ); diff --git a/packages/playground/data-liberation/tests/import/blueprint-import.json b/packages/playground/data-liberation/tests/import/blueprint-import.json index c0ec10766a..d34478b3aa 100644 --- a/packages/playground/data-liberation/tests/import/blueprint-import.json +++ b/packages/playground/data-liberation/tests/import/blueprint-import.json @@ -2,6 +2,7 @@ "$schema": "../../../blueprints/public/blueprint-schema.json", "constants": { "WP_DEBUG": true, + "WP_DEBUG_DISPLAY": true, "WP_DEBUG_LOG": true }, "login": true,
Date