diff --git a/README.md b/README.md index cf8b0b5..bc131ee 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Install as usual, see [this](https://drupal.org/documentation/install/modules-th Set 'Last Modified Solr Field' and 'Maximum number of Islandora links to process at once' in Administration » Islandora » XML Sitemap Integration (admin/islandora/xmlsitemap). -![Configuration](https://camo.githubusercontent.com/407972e0a2c14bafd74924992c659021b800abb0/687474703a2f2f692e696d6775722e636f6d2f455a534f4b68372e706e67) +![Configuration](https://user-images.githubusercontent.com/2461961/35802085-ffafc758-0a6e-11e8-8a0c-e1f09e4d2a45.png) ### Notes @@ -48,6 +48,16 @@ Please also note that objects marked as "inactive", whether manually or by using Larger sites with greater than 100,000 objects may encounter issues during the sitemap building process with the default configuration, such as the process hanging around a specific number indefinitely or exiting the process entirely before completion. These users may want to try unchecking the "Prefetch URL aliases during sitemap generation" option found on the xmlsitemap admin configuration page (/admin/config/search/xmlsitemap/settings) and trying the process again. +## Bulk generation using Drush +There is a Drush command available for the generation of sitemap links. This command optionally lets you process only a limited number of objects (`limit`), similar to `hook_cron()`, and lets you define how many objects are fetched from Solr at once (`max_chunk_size`). + +Command: +`drush islandora_xmlsitemap_generate [--max_chunk_size=100] [--limit=1000] [--regenerate]` + +* The `max_chunk_size` defaults to 100 +* If no `limit` is set, all objects will be processed +* The `--regenerate` flag removes the stored "last_modified" value, which causes processing to start from the beginning. *Use with caution if you have lots of objects*. 
+
 ## Documentation
 
 This module's documentation is also available at [our wiki](https://wiki.duraspace.org/display/ISLANDORA/Islandora+XML+Sitemap).
diff --git a/includes/admin.form.inc b/includes/admin.form.inc
index 4fe19d3..7e2560e 100644
--- a/includes/admin.form.inc
+++ b/includes/admin.form.inc
@@ -34,13 +34,22 @@ function islandora_xmlsitemap_admin_form($form, &$form_state) {
     '#description' => t('Solr field in which we can perform sorting and range queries.'),
     '#default_value' => variable_get('islandora_xmlsitemap_last_modified_field', 'fgs_lastModifiedDate_dt'),
   );
-  $form['islandora_xmlsitemap_number_of_pids_to_process'] = array(
+  $form['islandora_xmlsitemap_generate_limit_cron'] = array(
     '#type' => 'textfield',
-    '#title' => 'Maximum Number of Islandora links to process at once',
+    '#title' => 'Maximum Number of Islandora links to process during hook_cron()',
     '#size' => 10,
     '#element_validate' => array('element_validate_integer_positive'),
-    '#default_value' => variable_get('islandora_xmlsitemap_number_of_pids_to_process', 1000),
-    '#description' => 'This is the number of Islandora/Fedora links we will process at once',
+    '#default_value' => variable_get('islandora_xmlsitemap_generate_limit_cron', 1000),
+    '#description' => 'This is the maximum number of Islandora/Fedora links we will automatically process when hook_cron() is called',
+  );
+
+  // Chunk size: how many links are fetched from Solr per batch iteration.
+  $form['islandora_xmlsitemap_generate_chunk_size'] = array(
+    '#type' => 'textfield',
+    '#title' => 'Amount of Islandora links to fetch and process at once',
+    '#size' => 10,
+    '#element_validate' => array('element_validate_integer_positive'),
+    '#default_value' => variable_get('islandora_xmlsitemap_generate_chunk_size', 100),
+    '#description' => 'This is the number of Islandora/Fedora links we will fetch and process at once (before updating the Last Modified value) when using the buttons below or when hook_cron() is called',
   );
 
   $form['actions'] = array(
@@ -74,7 +83,8 @@ function islandora_xmlsitemap_admin_form_submit(&$form, &$form_state) {
 
   $to_set = array(
     'islandora_xmlsitemap_last_modified_field',
-    'islandora_xmlsitemap_number_of_pids_to_process',
+    'islandora_xmlsitemap_generate_limit_cron',
+    'islandora_xmlsitemap_generate_chunk_size',
   );
   if ($button == 'generate' || $button == 'regenerate') {
     if ($button == 'regenerate') {
@@ -82,7 +92,10 @@ function islandora_xmlsitemap_admin_form_submit(&$form, &$form_state) {
     }
 
     module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch');
-    $batch = islandora_xmlsitemap_get_batch(100, -1);
+    // A NULL limit means "no limit": the UI buttons process every object.
+    $batch = islandora_xmlsitemap_get_batch(
+      variable_get('islandora_xmlsitemap_generate_chunk_size', 100),
+      NULL
+    );
     batch_set($batch);
   }
   elseif ($button == 'submit') {
diff --git a/includes/batch.inc b/includes/batch.inc
index b2b2d9b..e7ae112 100644
--- a/includes/batch.inc
+++ b/includes/batch.inc
@@ -1,5 +1,4 @@ t('Time elapsed: @elapsed
Estimated time remaining @estimate.'),
     'operations' => array(
-      array('islandora_xmlsitemap_batch_operation', array($jump, $cutoff)),
+      // Using a single operation that will be called multiple times.
+      array('islandora_xmlsitemap_batch_operation', array($max_chunk_size, $batch_limit)),
     ),
     'file' => "$mod_path/includes/batch.inc",
+    'finished' => 'islandora_xmlsitemap_finished',
   );
   return $batch;
 }
 
 /**
- * Batch operation.
+ * Batch operation. Grab a number of records from Solr at a time.
+ *
+ * This operation will be called multiple times until batch limit is reached or
+ * there are no records newer than the last_modified date that was defined.
+ *
+ * Gets variables:
+ * - islandora_xmlsitemap_last_modified_value (starting point for query)
+ * - islandora_xmlsitemap_last_modified_field (field to look for
+ *   last_modified_value to match against)
+ * - islandora_namespace_restriction_enforced
  *
- * Grab a number of records from Solr at a time, until we hit the cutoff
- * number.
+ * Sets variable:
+ * - islandora_xmlsitemap_last_modified_value
  *
- * @param int $jump
- *   The number of records to grab each iteration.
- * @param int $cutoff
- *   The number of records to process per batch.
+ * @param int $max_chunk_size
+ *   The number of records to grab each iteration of the operation.
+ * @param int|null $batch_limit
+ *   The number of records to grab before ending the batch process, or NULL
+ *   to process every matching record.
+ * @param array &$context
+ *   Reference to persistent context of the operation.
  */
-function islandora_xmlsitemap_batch_operation($jump, $cutoff, &$context) {
+function islandora_xmlsitemap_batch_operation($max_chunk_size, $batch_limit, &$context) {
+  // No defaults on the first two parameters: they precede the required
+  // $context, so defaults could never be used and are deprecated in PHP 8.
+  // Create sandbox alias.
   $sandbox =& $context['sandbox'];
-  if (!isset($sandbox['offset'])) {
-    $sandbox['offset'] = 0;
-
-    $sandbox['last_modified'] = variable_get('islandora_xmlsitemap_last_modified_value', NULL);
-
-    $date_field = variable_get('islandora_xmlsitemap_last_modified_field', 'fgs_lastModifiedDate_dt');
-
-    $qp = $sandbox['query_processor'] = new IslandoraSolrQueryProcessor();
-    $qp->solrLimit = $cutoff > 0 ?
-      min($jump, $cutoff) :
-      $jump;
-    $qp->solrQuery = isset($sandbox['last_modified']) ?
-      "$date_field:[{$sandbox['last_modified']} TO *]" :
-      "$date_field:[* TO *]";
-    $qp->solrParams['sort'] = "$date_field asc";
-    $qp->solrParams['fq'] = array();
-    $qp->solrParams['fl'] = 'PID';
-
-    $namespaces_enforced = variable_get('islandora_namespace_restriction_enforced', FALSE);
-    if ($namespaces_enforced) {
-      $namespace_map = function ($namespace) {
-        return 'PID:' . Apache_Solr_Service::escape("$namespace:") . '*';
-      };
-      module_load_include('inc', 'islandora', 'includes/utilities');
-      $qp->solrParams['fq'][] = implode(' OR ', array_map($namespace_map, islandora_get_allowed_namespaces()));
+  /*
+   * Sandbox setup (only on first call of operation).
+   */
+  if (empty($sandbox)) {
+    $sandbox = array();
+
+    // The amount of rows processed.
+    $sandbox['progress'] = 0;
+
+    // The PID of the latest processed row.
+    $sandbox['current_node'] = "None";
+
+    // Represents total to be retrieved during this batch process. Set after
+    // initial results are retrieved.
+    $sandbox['total'] = NULL;
+
+    // Set a batch limit if specified.
+    if (isset($batch_limit)) {
+      $sandbox['batch_limit'] = (int) $batch_limit;
     }
+
+    // Bootstrap an IslandoraSolrQueryProcessor for our purposes.
+    $last_modified = $sandbox['last_modified'] = variable_get('islandora_xmlsitemap_last_modified_value', NULL);
+    $last_modified_field = variable_get('islandora_xmlsitemap_last_modified_field', 'fgs_lastModifiedDate_dt');
+    $enforce_namespaces = variable_get('islandora_namespace_restriction_enforced', FALSE);
+    $sandbox['queryProcessor'] = islandora_xmlsitemap_queryProcessor_create($last_modified_field, $last_modified, $enforce_namespaces);
+
+    // Set maximum chunk size.
+    $sandbox['max_chunk_size'] = (int) $max_chunk_size;
+
+    // Set default SOLR limit.
+    $sandbox['queryProcessor']->solrLimit = $sandbox['max_chunk_size'];
   }
-  else {
-    $sandbox['offset'] += $jump;
-    $qp = $sandbox['query_processor'];
+
+  /*
+   * Iteration start.
+   */
+
+  // Set SOLR offset to where we left off in the previous iteration.
+  $sandbox['queryProcessor']->solrStart = $sandbox['progress'];
+
+  // Set chunk size for limited batches. This prevents overshooting
+  // the batch_limit.
+  if (isset($sandbox['batch_limit'])) {
+    // Max chunk size for limited batches.
+    $sandbox['max_chunk_size'] = min($sandbox['batch_limit'] - $sandbox['progress'], $sandbox['max_chunk_size']);
+
+    // Update SOLR limit.
+    $sandbox['queryProcessor']->solrLimit = $sandbox['max_chunk_size'];
   }
 
-  $qp->solrStart = $sandbox['offset'];
+  // Execute SOLR query and get result.
+  $sandbox['queryProcessor']->executeQuery(FALSE, TRUE);
+  $result = $sandbox['queryProcessor']->islandoraSolrResult;
 
-  // Query for $count PIDs, starting from $offset.
-  $qp->executeQuery(FALSE, TRUE);
-  $results = $qp->islandoraSolrResult;
+  // Result will be null in case of SOLR error. The queryprocessor will have set
+  // an error using drupal_set_message.
+  $hasSolrError = ($result === NULL);
 
-  $sandbox['total'] = $cutoff > 0 ?
-    min((int) $results['response']['numFound'], $cutoff) :
-    (int) $results['response']['numFound'];
-  if ($results === NULL || $sandbox['total'] === 0) {
-    $context['message'] = t('No results selected, or errored...');
-    // Stash the most current value, so we can hopefully pick up where we left
-    // off last time.
-    variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']);
+  // Only dereference the response once we know the query succeeded.
+  $resultRows = (!$hasSolrError && isset($result['response']['objects'])) ? $result['response']['objects'] : array();
+  $resultCount = $hasSolrError ? 0 : (int) $result['response']['numFound'];
+
+  // Catch a result that does not have any rows. An empty page with a non-zero
+  // numFound (limit of 0, or the index shrinking mid-batch) would never
+  // advance 'progress', so it must end the batch as well.
+  $noResults = ($resultCount === 0 || empty($resultRows));
+
+  // Check if we encountered error conditions.
+  if ($hasSolrError || $noResults) {
+    // A SOLR error takes precedence over the empty result it implies.
+    $context['message'] = $hasSolrError ? t('SOLR error') : t('No results');
+
+    // Add error to the result list.
+    $context['results'][] = $context['message'];
+
+    // Set finished to 1 so the batch process will not loop endlessly.
+    $context['finished'] = 1;
+
+    // Return control to batch engine.
     return;
   }
 
-  // Add/update each result in the custom table.
+  // Set total based on results (only once every batch).
+  if (!isset($sandbox['total'])) {
+    if (isset($sandbox['batch_limit'])) {
+      // Limited batch total is batch_limit or resultcount if there are less
+      // results than the batch_limit.
+      $sandbox['total'] = min($resultCount, $sandbox['batch_limit']);
+    }
+    else {
+      // Unlimited batch total is equal to the number of results.
+      $sandbox['total'] = $resultCount;
+    }
+  }
+
+  // Process result rows.
   module_load_include('inc', 'islandora_xmlsitemap', 'includes/utilities');
-  foreach ($results['response']['objects'] as $result) {
-    islandora_xmlsitemap_add_or_update_link($result['PID'], $sandbox);
+  foreach ($resultRows as $row) {
+    // Add or Update link.
+    islandora_xmlsitemap_add_or_update_link($row['PID'], $sandbox);
+
+    // Update our progress information.
+    $sandbox['progress']++;
+    $sandbox['current_node'] = $row['PID'];
+
+    // Log result for post-processing in the 'finished' callback.
+    $context['results'][] = "Add/update " . $sandbox['current_node'];
   }
 
-  $qp->resetResults();
+  // Drop results when finished with them.
+  $sandbox['queryProcessor']->resetResults();
 
-  $context['message'] = t('Processed @count of @total.', array(
-    '@count' => min($sandbox['offset'] + $jump, $sandbox['total']),
+  // Set message to be returned.
+  $context['message'] = t('Processed @count of @total. Current node: @pid.', array(
+    '@count' => $sandbox['progress'],
     '@total' => $sandbox['total'],
+    '@pid' => $sandbox['current_node'],
   ));
-  $context['finished'] = $sandbox['offset'] / $sandbox['total'];
-  if ($context['finished'] >= 1) {
+
+  // Persist the last processed record's last modified date.
+  // This facilitates continuing from this point onward next time in case
+  // not all records have been processed due to limits set or a crash.
+  if (isset($sandbox['last_modified'])) {
     variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']);
   }
+
+  // Inform the batch engine of our progress,
+  // and provide an estimation of the completion level we reached.
+  // 'total' is guaranteed non-zero here: the zero case returned above.
+  $context['finished'] = $sandbox['progress'] / $sandbox['total'];
+}
+
+/**
+ * Bootstraps a new queryprocessor for Islandora XMLsitemap.
+ *
+ * @param string $last_modified_field
+ *   The date field to use for sorting and filtering.
+ * @param string $last_modified
+ *   The last modified date.
+ * @param bool $enforce_namespaces
+ *   Enforce Islandora namespaces or not. If enabled allowed namespaces will be
+ *   retrieved using islandora_get_allowed_namespaces(). Defaults to FALSE.
+ *
+ * @return IslandoraSolrQueryProcessor
+ *   Bootstrapped SOLR query processor.
+ */
+function islandora_xmlsitemap_queryProcessor_create($last_modified_field, $last_modified = NULL, $enforce_namespaces = FALSE) {
+  // Instantiate Islandora Query Processor.
+  $qp = new IslandoraSolrQueryProcessor();
+
+  // Set sorting on the $last_modified_field field, ascending so we go from
+  // older to newer records.
+  $qp->solrParams['sort'] = "$last_modified_field asc";
+
+  // Set empty filter query.
+  $qp->solrParams['fq'] = array();
+
+  // Set or update the latest last_modified value.
+  $qp->solrQuery = islandora_xmlsitemap_queryProcessor_last_modified_query(
+    $last_modified_field,
+    $last_modified
+  );
+
+  // Return only the PID field.
+  $qp->solrParams['fl'] = 'PID';
+
+  // Enforce namespace if required by settings.
+  if ($enforce_namespaces) {
+    $namespace_map = function ($namespace) {
+      return 'PID:' . Apache_Solr_Service::escape("$namespace:") . '*';
+    };
+    module_load_include('inc', 'islandora', 'includes/utilities');
+    $qp->solrParams['fq'][] = implode(' OR ', array_map($namespace_map, islandora_get_allowed_namespaces()));
+  }
+
+  return $qp;
+}
+
+/**
+ * Creates a SOLR query string based on the last modified value.
+ *
+ * @param string $last_modified_field
+ *   Last modified date field.
+ * @param string $last_modified
+ *   Last modified date. When NULL the query matches every value of the field.
+ *
+ * @return string
+ *   SOLR query string.
+ */
+function islandora_xmlsitemap_queryProcessor_last_modified_query($last_modified_field, $last_modified = NULL) {
+  if (isset($last_modified)) {
+    return "$last_modified_field:[{$last_modified} TO *]";
+  }
+  return "$last_modified_field:[* TO *]";
+}
+
+/**
+ * Finished callback.
+ *
+ * Displays result or errors when the batch finishes.
+ *
+ * @param bool $success
+ *   TRUE when the batch completed without a fatal error.
+ * @param array $results
+ *   The messages collected in $context['results'] by the batch operation.
+ * @param array $operations
+ *   The operations that remained unprocessed when the batch stopped.
+ */
+function islandora_xmlsitemap_finished($success, $results, $operations) {
+  if ($success) {
+    // Display the result.
+    drupal_set_message(t('@count results', array('@count' => count($results))));
+    drupal_set_message(t('The last result was "%final"', array('%final' => end($results))));
+  }
+  else {
+    // The first remaining operation is the one that failed. Each operation is
+    // array(callback, arguments), so index 1 holds the batch arguments.
+    $error_operation = reset($operations);
+    $error_arguments = isset($error_operation[1]) ? $error_operation[1] : array();
+
+    // Display an error.
+    drupal_set_message(
+      t(
+        'An error occurred while processing. The last result was: %final.
Batch arguments: @args',
+        array(
+          '%final' => end($results),
+          '@args' => print_r($error_arguments, TRUE),
+        )
+      ),
+      'error'
+    );
+  }
 }
diff --git a/islandora_xmlsitemap.drush.inc b/islandora_xmlsitemap.drush.inc
new file mode 100644
index 0000000..49e6745
--- /dev/null
+++ b/islandora_xmlsitemap.drush.inc
@@ -0,0 +1,99 @@
+ 'Generate XMLsitemap records.',
+    'options' => array(
+      'max_chunk_size' => 'The number of records to grab each SOLR request',
+      'limit' => 'The total number of records to grab before ending the process (defaults to no limit)',
+      'regenerate' => 'If set will clear the last_modified date before generating.'
+    )
+  );
+  return $items;
+}
+
+/**
+ * Validate the input for the islandora_xmlsitemap_generate command.
+ *
+ * Implements drush_hook_COMMAND_validate().
+ */
+function drush_islandora_xmlsitemap_generate_validate() {
+  // Validate max_chunk_size option.
+  $max_chunk_size = drush_get_option('max_chunk_size', NULL);
+  if ($max_chunk_size !== NULL) {
+    if (!islandora_xmlsitemap_is_integer_positive($max_chunk_size)) {
+      return drush_set_error('The option "max_chunk_size" must be a positive integer greater than 0.');
+    }
+  }
+
+  // Validate limit option.
+  $limit = drush_get_option('limit', NULL);
+  if ($limit !== NULL) {
+    if (!islandora_xmlsitemap_is_integer_positive($limit)) {
+      return drush_set_error('The option "limit" must be a positive integer greater than 0.');
+    }
+  }
+
+  // Validate regenerate option.
+  $regenerate = drush_get_option('regenerate', NULL);
+  if ($regenerate !== NULL) {
+    if ($regenerate !== TRUE) {
+      return drush_set_error('The option "regenerate" can only be used without a value (--regenerate)');
+    }
+  }
+
+  return TRUE;
+}
+
+/**
+ * Execute the islandora_xmlsitemap_generate command.
+ *
+ * Implements drush_hook_COMMAND().
+ */
+function drush_islandora_xmlsitemap_generate() {
+  // Remove last_modified value if we need to regenerate.
+  $regenerate = drush_get_option('regenerate', NULL);
+  if (isset($regenerate)) {
+    variable_del('islandora_xmlsitemap_last_modified_value');
+  }
+
+  // Get the batch.
+  module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch');
+  $batch = islandora_xmlsitemap_get_batch(drush_get_option('max_chunk_size', 100), drush_get_option('limit', NULL));
+
+  // Set and configure the batch.
+  batch_set($batch);
+  $batch =& batch_get();
+  $batch['progressive'] = FALSE;
+
+  // Start processing.
+  drush_backend_batch_process();
+}
+
+/**
+ * Check if a (string) value is both not empty, an integer and positive.
+ *
+ * Validation logic taken from:
+ * https://api.drupal.org/api/drupal/includes%21form.inc/function/element_validate_integer_positive/7.x
+ *
+ * Unlike the form validator, an empty string is rejected here: an empty
+ * option value (e.g. "--limit=") would otherwise be cast to a limit of 0.
+ *
+ * @param mixed $value
+ *   The option value to check.
+ *
+ * @return bool
+ *   TRUE if $value is a positive integer, FALSE if $value is not a positive
+ *   integer.
+ */
+function islandora_xmlsitemap_is_integer_positive($value) {
+  return $value !== '' && is_numeric($value) && intval($value) == $value && $value > 0;
+}
diff --git a/islandora_xmlsitemap.module b/islandora_xmlsitemap.module
index aac7c90..cf74a18 100644
--- a/islandora_xmlsitemap.module
+++ b/islandora_xmlsitemap.module
@@ -10,7 +10,11 @@ define('ISLANDORA_XMLSITEMAP_PATH_PREFIX', 'islandora/object/');
  */
 function islandora_xmlsitemap_cron() {
   module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch');
-  xmlsitemap_run_unprogressive_batch('islandora_xmlsitemap_get_batch');
+  xmlsitemap_run_unprogressive_batch(
+    'islandora_xmlsitemap_get_batch',
+    variable_get('islandora_xmlsitemap_generate_chunk_size', 100),
+    variable_get('islandora_xmlsitemap_generate_limit_cron', 1000)
+  );
 }
 
 /**