Not sure if it is of broad interest, but posting it here anyway:

Use the following custom fetcher, based on FeedsFileFetcher, to import data from an archive that contains an XML feed source and image files. Create a custom module and add a plugin file (e.g. my_module/plugins/FeedsArchiveFileFetcher.inc) with the following code:


/**
 * @file
 * Home of the FeedsArchiveFileFetcher and related classes.
 */

/**
 * Fetcher result type belonging to FeedsArchiveFileFetcher.
 *
 * Behaves like FeedsFileFetcherResult, except that raw content access is not
 * supported.
 */
class FeedsArchiveFileFetcherResult extends FeedsFileFetcherResult {

  /**
   * Overrides parent::getRaw().
   *
   * @return null
   *   Always NULL: retrieving the raw content is not implemented.
   */
  public function getRaw() {
    return NULL;
  }
}

/**
 * Fetches a feed source file from a local archive or a directory of archives.
 */
class FeedsArchiveFileFetcher extends FeedsFileFetcher {

  public $processed_archives;

  /**
   * Implements FeedsFetcher::fetch().
   *
   * The configured source is either a single archive file or a directory
   * containing archives. For a directory, one archive is processed per call,
   * so this function gets called multiple times when there are multiple
   * archives to import from.
   *
   * @param FeedsSource $source
   *   The source being imported.
   *
   * @return FeedsFileFetcherResult
   *   A result pointing at the feed source file extracted from the archive.
   *
   * @throws Exception
   *   If the source is neither a file nor a directory containing archives, if
   *   the archive cannot be extracted, or if the extracted archive does not
   *   contain the configured feed source file.
   */
  public function fetch(FeedsSource $source) {
    $source_config = $source->getConfigFor($this);
    $state = $source->state(FEEDS_FETCH);

    // Single archive: extract it and return the feed source it contains.
    if (is_file($source_config['source'])) {
      $feed_source_file = $this->prepareFeedSource($state, $source_config['source']);
      return new FeedsFileFetcherResult($feed_source_file);
    }

    // Directory: build the list of archives once, then batch through it.
    if (!isset($state->files)) {
      $state->files = $this->listFiles($source_config['source']);
      $state->total = count($state->files);
    }
    if (count($state->files)) {
      // Get next file.
      $archive_file = array_shift($state->files);
      // Set batch progress.
      $state->progress($state->total, $state->total - count($state->files));
      // Expose the archive being processed, e.g. to implementations of
      // hook_feeds_after_fetch_archive_file().
      $state->current_archive_file = $archive_file;
      $feed_source_file = $this->prepareFeedSource($state, $archive_file);

      $fetcher_result = new FeedsFileFetcherResult($feed_source_file);
      module_invoke_all('feeds_after_fetch_archive_file', $source, $fetcher_result);

      return $fetcher_result;
    }

    throw new Exception(t('Resource is not a file or it is an empty directory: %source', array('%source' => $source_config['source'])));
  }

  /**
   * Extracts an archive and locates the feed source file inside it.
   *
   * @param object $state
   *   The fetcher state; receives extract_dir and processed_archives.
   * @param string $archive_file
   *   URI of the archive to extract.
   *
   * @return string
   *   URI of the feed source file inside the extraction directory.
   *
   * @throws Exception
   *   If extraction fails or the feed source file is not in the archive.
   */
  protected function prepareFeedSource($state, $archive_file) {
    $state->extract_dir = $this->extractFeedArchive($archive_file);
    // Store processed archive and directory for postprocessing, i.e. to
    // delete archive and temp. directory. This is recorded before the feed
    // source lookup so the bookkeeping exists even when the lookup fails.
    $state->processed_archives[$archive_file] = $state->extract_dir;

    $feed_source_file = $this->getFeedSourceFile($state->extract_dir);
    if (empty($feed_source_file)) {
      // Fail here with a meaningful message instead of handing an empty path
      // to the fetcher result.
      throw new Exception(t('The archive %archive does not contain the feed source file %file.', array(
        '%archive' => $archive_file,
        '%file' => $this->config['feed_source_file_name'],
      )));
    }
    return $feed_source_file;
  }

  /**
   * Returns an array of supported archive files in a directory.
   *
   * Only files whose name ends in one of the extensions reported by
   * archiver_get_extensions() are returned; hidden files (leading dot) are
   * skipped.
   *
   * @param string $dir
   *   A stream wrapper URI that is a directory.
   *
   * @return array
   *   An array of stream wrapper URIs pointing to files. The array is empty if
   *   no files could be found. Never contains directories.
   */
  protected function listFiles($dir) {
    $dir = file_stream_wrapper_uri_normalize($dir);
    $files = array();

    // Scan for supported archives only. Without any supported extension the
    // pattern below would degrade to "ends with a dot", so bail out early.
    $extensions = trim(archiver_get_extensions());
    if ($extensions === '') {
      return $files;
    }
    // Regex from file_validate_extensions(), additionally escaping the
    // pattern delimiter.
    $regex = '/\.(' . preg_replace('/ +/', '|', preg_quote($extensions, '/')) . ')$/i';

    // An unreadable or missing directory is treated as "no files".
    $items = @scandir($dir);
    if (!$items) {
      return $files;
    }

    foreach ($items as $item) {
      if (is_file("$dir/$item") && strpos($item, '.') !== 0 && preg_match($regex, $item)) {
        $files[] = "$dir/$item";
      }
    }
    return $files;
  }

  /**
   * Extracts the archive to a newly created temporary directory.
   *
   * @param string $archive_file
   *   Path or URI of the archive file to extract.
   *
   * @return string
   *   URI of the temporary directory the archive was extracted to.
   *
   * @throws Exception
   *   If no archiver supports the file or the temporary directory cannot be
   *   created.
   */
  protected function extractFeedArchive($archive_file) {
    $archiver = archiver_get_archiver($archive_file);

    if (!$archiver) {
      throw new Exception(t('Archive type of %archive_file is not supported.', array('%archive_file' => $archive_file)));
    }

    // Extract archive in temporary directory. The random suffix keeps
    // directories of archives processed within the same minute apart.
    $directory = 'temporary://feeds-importer-' . $this->id . '-' . date('Ymd-Hi-') . substr(drupal_hash_base64(drupal_random_bytes(8)), 0, 8);
    // Do not extract into a directory that could not be created.
    if (!file_prepare_directory($directory, FILE_CREATE_DIRECTORY)) {
      throw new Exception(t('The temporary directory %directory could not be created.', array('%directory' => $directory)));
    }
    $archiver->extract($directory);

    return $directory;
  }

  /**
   * Locates the configured feed source file inside an extracted archive.
   *
   * @param string $extract_dir
   *   The directory where the archive was extracted to.
   *
   * @return string|null
   *   The path of the feed source file, or NULL if it does not exist.
   */
  protected function getFeedSourceFile($extract_dir) {
    // TODO: replace placeholders i.e. %archive-name in "%archive-name.xml"
    $candidate = $extract_dir . '/' . $this->config['feed_source_file_name'];

    return file_exists($candidate) ? $candidate : NULL;
  }

  /**
   * Builds the source form.
   *
   * Offers a path text field when direct mode is configured, and a file
   * upload field otherwise.
   *
   * @param array $source_config
   *   The current source configuration ('source' and 'fid' are read).
   *
   * @return array
   *   The form array.
   */
  public function sourceForm($source_config) {
    $has_source = !empty($source_config['source']);
    $fid = empty($source_config['fid']) ? 0 : $source_config['fid'];

    $form = array();
    $form['fid'] = array(
      '#type' => 'value',
      '#value' => $fid,
    );

    // Direct mode: the user types a path to a file or directory.
    if (!empty($this->config['archive_direct'])) {
      $form['source'] = array(
        '#type' => 'textfield',
        '#title' => t('File or directory'),
        '#description' => t('Specify a path to a file or a directory. Prefix the path with a scheme. Available schemes: @schemes.', array('@schemes' => implode(', ', $this->config['archive_allowed_schemes']))),
        '#default_value' => $has_source ? $source_config['source'] : $this->config['archive_directory'],
      );
      return $form;
    }

    // Upload mode: carry the stored source along and offer an upload field.
    $form['source'] = array(
      '#type' => 'value',
      '#value' => $has_source ? $source_config['source'] : '',
    );
    if ($has_source) {
      $description = t('Select a different archive file from your local system.');
    }
    else {
      $description = t('Select an archive file from your local system.');
    }
    // NOTE(review): newer Feeds releases reportedly need
    // '#theme_wrappers' => array('feeds_upload') instead of '#theme' here;
    // confirm against the installed Feeds version.
    $form['upload'] = array(
      '#type' => 'file',
      '#title' => t('File'),
      '#description' => $description,
      '#theme' => 'feeds_upload',
      '#file_info' => $fid ? file_load($fid) : NULL,
      '#size' => 10,
    );
    return $form;
  }

  /**
   * Overrides parent::sourceFormValidate().
   *
   * In direct mode the entered path is checked for an allowed scheme and for
   * existence. In upload mode the upload directory is prepared and the
   * uploaded archive is validated and saved.
   *
   * @param array $values
   *   The submitted source form values, modified in place.
   */
  public function sourceFormValidate(&$values) {
    $error_element = 'feeds][FeedsArchiveFileFetcher][source';
    $values['source'] = trim($values['source']);

    // Path to a file or directory of files.
    if (!empty($this->config['archive_direct'])) {
      $allowed_schemes = $this->config['archive_allowed_schemes'];
      // Check if chosen url scheme is allowed.
      $scheme = file_uri_scheme($values['source']);
      if (!$scheme || !in_array($scheme, $allowed_schemes)) {
        form_set_error($error_element, t("The file needs to reside within the site's files directory, its path needs to start with scheme://. Available schemes: @schemes.", array('@schemes' => implode(', ', $allowed_schemes))));
      }
      // Check whether the given path exists.
      elseif (!file_exists($values['source'])) {
        form_set_error($error_element, t('The specified file or directory does not exist.'));
      }
      return;
    }

    // Uploaded file: the upload directory has to be usable first.
    $feed_dir = $this->config['archive_directory'];
    if (!file_prepare_directory($feed_dir, FILE_CREATE_DIRECTORY | FILE_MODIFY_PERMISSIONS)) {
      if (user_access('administer feeds')) {
        $plugin_key = feeds_importer($this->id)->config[$this->pluginType()]['plugin_key'];
        $link = url('admin/structure/feeds/' . $this->id . '/settings/' . $plugin_key);
        form_set_error($error_element, t('Upload failed. Please check the upload <a href="@link">settings.</a>', array('@link' => $link)));
      }
      else {
        form_set_error($error_element, t('Upload failed. Please contact your site administrator.'));
      }
      watchdog('feeds', 'The upload directory %directory required by a feed could not be created or is not accessible. A newly uploaded file could not be saved in this directory as a consequence, and the upload was canceled.', array('%directory' => $feed_dir));
      return;
    }

    // Validate and save uploaded file.
    $validators = array('file_validate_extensions' => array(archiver_get_extensions()));
    $file = file_save_upload('feeds', $validators, $feed_dir);
    if ($file) {
      $values['source'] = $file->uri;
      $values['file'] = $file;
      return;
    }

    // No new upload: only complain if there is no file from a previous
    // upload either.
    if (empty($values['source'])) {
      form_set_error($error_element, t('Please upload a file.'));
    }
  }

  /**
   * Overrides parent::configDefaults().
   *
   * @return array
   *   The default configuration. The upload directory uses the private
   *   scheme when getSchemes() reports it, and the public scheme otherwise.
   */
  public function configDefaults() {
    $schemes = $this->getSchemes();

    $defaults = array();
    $defaults['archive_direct'] = FALSE;
    $defaults['archive_directory'] = (in_array('private', $schemes) ? 'private' : 'public') . '://feeds';
    $defaults['archive_allowed_schemes'] = $schemes;
    $defaults['feed_source_allowed_extensions'] = 'txt csv tsv xml opml';
    $defaults['feed_source_file_name'] = 'feed.xml';

    return $defaults;
  }

  /**
   * Overrides parent::configForm().
   *
   * Builds two fieldsets: one for the archive itself (direct mode, upload
   * directory, allowed schemes) and one for the feed source file expected
   * inside the archive.
   *
   * @param array $form_state
   *   The form state.
   *
   * @return array
   *   The configuration form array.
   */
  public function configForm(&$form_state) {
    $form = array();
    // The #states selectors must match the name of the checkbox element
    // defined below ('archive_direct'), otherwise the dependent elements
    // never react to it.
    $direct_selector = ':input[name="archive_direct"]';

    $form['archive'] = array(
      '#type' => 'fieldset',
      '#title' => t('Feed archive file'),
    );
    $form['archive']['supported_extensions'] = array(
      '#type' => 'item',
      '#title' => t('Supported archive types'),
      '#markup' => archiver_get_extensions(),
    );
    $form['archive']['archive_direct'] = array(
      '#type' => 'checkbox',
      '#title' => t('Supply path to file or directory directly'),
      '#description' => t('For experts. Lets users specify a path to a file <em>or a directory of files</em> directly,
        instead of a file upload through the browser. This is useful when the files that need to be imported
        are already on the server.'),
      '#default_value' => $this->config['archive_direct'],
    );
    $form['archive']['archive_directory'] = array(
      '#type' => 'textfield',
      '#title' => t('Upload directory'),
      '#description' => t('Directory where uploaded files get stored. Prefix the path with a scheme. Available schemes: @schemes.', array('@schemes' => implode(', ', $this->getSchemes()))),
      '#default_value' => $this->config['archive_directory'],
      // Only relevant (and required) when files are uploaded.
      '#states' => array(
        'visible' => array($direct_selector => array('checked' => FALSE)),
        'required' => array($direct_selector => array('checked' => FALSE)),
      ),
    );
    if ($options = $this->getSchemeOptions()) {
      $form['archive']['archive_allowed_schemes'] = array(
        '#type' => 'checkboxes',
        '#title' => t('Allowed schemes'),
        '#default_value' => $this->config['archive_allowed_schemes'],
        '#options' => $options,
        '#description' => t('Select the schemes you want to allow for direct upload.'),
        // Only relevant in direct mode.
        '#states' => array(
          'visible' => array($direct_selector => array('checked' => TRUE)),
        ),
      );
    }

    $form['feed_source'] = array(
      '#type' => 'fieldset',
      '#title' => t('Feed source file inside the archive'),
    );
    $form['feed_source']['feed_source_allowed_extensions'] = array(
      '#type' => 'textfield',
      '#title' => t('Allowed file extensions'),
      '#description' => t('Allowed file extensions for feed source file.'),
      '#default_value' => $this->config['feed_source_allowed_extensions'],
      '#required' => TRUE,
    );
    $form['feed_source']['feed_source_file_name'] = array(
      '#type' => 'textfield',
      '#title' => t('File name'),
      '#description' => t('Name of feed source file inside archive.'),
      '#default_value' => $this->config['feed_source_file_name'],
    );

    return $form;
  }

  /**
   * Overrides parent::configFormValidate().
   *
   * Ensures that the chosen upload directory is accessible (upload mode
   * only) and that the feed source file name carries an allowed extension.
   * Both checks always run, so all problems are reported in one submission.
   *
   * @param array $values
   *   The submitted configuration values, normalized in place.
   */
  public function configFormValidate(&$values) {
    $values['archive_directory'] = trim($values['archive_directory']);
    // The schemes element is only part of the form when scheme options
    // exist, so the value may be absent.
    if (isset($values['archive_allowed_schemes']) && is_array($values['archive_allowed_schemes'])) {
      $values['archive_allowed_schemes'] = array_filter($values['archive_allowed_schemes']);
    }

    if (empty($values['archive_direct'])) {
      $this->validateArchiveDirectory($values);
    }
    $this->validateFeedSourceFileName($values);
  }

  /**
   * Validates the upload directory used when not in direct mode.
   *
   * @param array $values
   *   The submitted configuration values.
   */
  protected function validateArchiveDirectory(&$values) {
    // Ensure that the upload directory field is not empty.
    if (!$values['archive_directory']) {
      form_set_error('archive_directory', t('Please specify an upload directory.'));
      // Do not continue validating the directory if none was specified.
      return;
    }

    // Validate the URI scheme of the upload directory.
    $scheme = file_uri_scheme($values['archive_directory']);
    if (!$scheme || !in_array($scheme, $this->getSchemes())) {
      form_set_error('archive_directory', t('Please enter a valid scheme into the directory location.'));
      // Return here so that attempts to create the directory below don't
      // throw warnings.
      return;
    }

    // Ensure that the upload directory exists.
    if (!file_prepare_directory($values['archive_directory'], FILE_CREATE_DIRECTORY | FILE_MODIFY_PERMISSIONS)) {
      form_set_error('archive_directory', t('The chosen directory does not exist and attempts to create it failed.'));
    }
  }

  /**
   * Validates that the feed source file name has an allowed extension.
   *
   * Extensions are compared case-insensitively, and the list of allowed
   * extensions may be separated by any amount of whitespace.
   *
   * @param array $values
   *   The submitted configuration values.
   */
  protected function validateFeedSourceFileName(&$values) {
    $file_name = isset($values['feed_source_file_name']) ? trim($values['feed_source_file_name']) : '';
    $values['feed_source_file_name'] = $file_name;
    if ($file_name === '') {
      // An empty file name is not validated against the extension list.
      return;
    }

    $allowed = isset($values['feed_source_allowed_extensions']) ? $values['feed_source_allowed_extensions'] : '';
    $allowed_extensions = preg_split('/\s+/', strtolower(trim($allowed)), -1, PREG_SPLIT_NO_EMPTY);
    $extension = strtolower(pathinfo($file_name, PATHINFO_EXTENSION));
    if ($extension === '' || !in_array($extension, $allowed_extensions, TRUE)) {
      form_set_error('feed_source_file_name', t('The file extension does not match any allowed extension.'));
    }
  }

}

Delete the archive and the temporary files with a custom module that implements hook_feeds_after_import(). Example for the my_module.module file:


/**
 * Implements hook_ctools_plugin_directory().
 *
 * Returns a plugin directory for the 'plugins' plugin type owned by Feeds,
 * and nothing for every other owner/type combination.
 */
function my_module_ctools_plugin_directory($owner, $plugin_type) {
  if ($owner != 'feeds' || $plugin_type != 'plugins') {
    return NULL;
  }
  // NOTE(review): this evaluates to "plugins/plugins", while the fetcher is
  // described as living in my_module/plugins; confirm the intended directory.
  return 'plugins/' . $plugin_type;
}

/**
 * Implements hook_feeds_plugins().
 *
 * Registers the FeedsArchiveFileFetcher plugin shipped in this module's
 * plugins directory.
 *
 * @return array
 *   Plugin definitions keyed by plugin name.
 */
function my_module_feeds_plugins() {
  $info = array();
  $info['FeedsArchiveFileFetcher'] = array(
    'name' => 'Archive File Fetcher',
    'description' => 'Import content from a local archive.',
    'handler' => array(
      // The class extends FeedsFileFetcher, so that plugin is the parent.
      'parent' => 'FeedsFileFetcher',
      'class' => 'FeedsArchiveFileFetcher',
      'file' => 'FeedsArchiveFileFetcher.inc',
      // Resolve the path from this module rather than from an unrelated one.
      'path' => drupal_get_path('module', 'my_module') . '/plugins',
    ),
  );
  return $info;
}

/**
 * Implements hook_feeds_after_import().
 *
 * Removes every archive recorded in the fetcher state's processed_archives
 * list, together with the directory it was extracted to.
 */
function my_module_feeds_after_import($source) {
  $fetch_state = $source->state(FEEDS_FETCH);

  if (!empty($fetch_state->processed_archives)) {
    foreach ($fetch_state->processed_archives as $archive_file => $directory) {
      // Remove the archive itself.
      if (is_file($archive_file)) {
        drupal_unlink($archive_file);
      }
      // Remove the extraction directory including everything below it.
      if (is_dir($directory)) {
        file_unmanaged_delete_recursive($directory);
      }
    }
  }
}

Don't forget to add files[] = plugins/FeedsArchiveFileFetcher.inc to my_module.info

Comments

dagomar’s picture

Holy moly this looks like exactly the thing I need! Danke! If I'm ever in Dresden I'll buy you a beer ;)

osopolar’s picture

Issue summary: View changes
caseyb’s picture

Many thanks for the updated notes. I think I am almost there but just getting an error with the module and Drupal 7.

So far I have done the following:
1. Created an .info file, a .module with the code supplied above in this location: /sites/default/modules/custom/FeedsArchiveFileFetcher
2. Add the additional line of code for the plugin in the .info file
3. Created a .inc file in this location: /sites/default/modules/custom/FeedsArchiveFileFetcher/plugins
4. Updated the name of the module in the module code.
5. Cleared my caches

Now I am getting an error in the modules list "This version is not compatible with Drupal 7.x and should be replaced."

My .info file contains the following information (although I have tried many variations):

name = Feeds Archive File Fetcher
description = Allows you to import zipped files.
package = Custom
core = 7.x
php = 5.3

files[] = plugins/FeedsArchiveFileFetcher.inc

Is this correct and is my file structure correct?

Thanks in advance for your help.

caseyb’s picture

I've sorted the .info file error out - it was to do with the header and had to be saved as UTF-8 without a Byte Order Mark (BOM). Saving it this way fixed the error in the module listing which displayed correctly.

Then when I tried to enable the module, I got a syntax error with the module file. This was because of the same issue as it was resolved when I resaved it.

Now, I'm having a syntax error with the plugin but resaving it with the UTF-8 format does not seem to resolve the problem.

caseyb’s picture

Issue resolved on plugin file error by replacing with a new .inc file. It is all showing correctly now and will run a few tests to see if it works okay.

Is there any way to pull the zipped file in from a url instead of uploading?

caseyb’s picture

All working now! Works perfectly for unzipping CSV files.

Any help on importing a zipped file from a url would be much appreciated.

osopolar’s picture

I guess it won't work out of the box. I derived the fetcher from FeedsFileFetcher (File upload) not from HTTP Fetcher. You could check if you can do the same that I've done using the FeedsFileFetcher extending the FeedsHTTPFetcher.

A workaround might be to just set up a cron job that downloads the file (using wget or curl) and place it in the feeds importers upload directory. The fetcher is able to grab all archives in one directory - so it shouldn't be a problem if there are multiple archives downloaded by the cron job.

caseyb’s picture

Thanks for the advice, I'll give both a try - still a very new at development so hopefully can figure one or the other out. (There are a lot of people looking for a similar solution online as this is how most affiliates send their feeds, so hope I can find a solution that will work).

Which one will have the least impact on site performance for pulling through hundreds of thousands of products?

osopolar’s picture

Issue summary: View changes

Changed fetch() to invoke hook_feeds_after_fetch_archive_file() after fetching, so that other modules can post-process the result. In my use case, the file name contains the user ID that is needed to set the author of the feed items.

osopolar’s picture

@#8: The only difference will be that cron job or drupal will download the archive. Not sure, if the archive is very large, there could be a timeout while downloading the file. The rest depends more on the parsers implementation (i.e. see #1213324: Parsing big xml file (250 mo /15.000 nodes)).

caseyb’s picture

Just an update to say that I have set up a cron job and it is all working perfectly now.

The fetcher is a great addition to the Feeds module!! Thanks for your quick response, useful info and help!!

dready2011’s picture

This is fantastic, and just what i needed. Thanks so much, works perfectly.
But after I updated feeds to 7.x-2.0-beta1+6-dev the button to upload the file was not rendered correctly anymore. Got this to work again by changing

$form['upload'] = array(
        '#type' => 'file',
        '#title' => empty($this->config['archive_direct']) ? t('File') : NULL,
        '#description' => empty($source_config['source']) ? t('Select an archive file from your local system.') : t('Select a different archive file from your local system.'),
        '#theme' => 'feeds_upload',
        '#file_info' => empty($source_config['fid']) ? NULL : file_load($source_config['fid']),
        '#size' => 10,
      );

to

$form['upload'] = array(
        '#type' => 'file',
        '#title' => empty($this->config['archive_direct']) ? t('File') : NULL,
        '#description' => empty($source_config['source']) ? t('Select an archive file from your local system.') : t('Select a different archive file from your local system.'),
        '#theme_wrappers' => array('feeds_upload'),
        '#file_info' => empty($source_config['fid']) ? NULL : file_load($source_config['fid']),
        '#size' => 10,
      );
MegaChriz’s picture

This feature request would fit better in a custom module.

Also closed #682102: Support archived and compressed feeds (zip, tar.gz) as duplicate.

zmove’s picture

@caseyb, do you think you could create a module (or at least a sandbox project) to allow people downloading the files and use them ?

Thanks

Alex

mxr576’s picture

Status: Active » Closed (won't fix)

A new contrib available with this long waited feature: https://www.drupal.org/project/feeds_fetcher_archive .
Implementation of this module is not compatible with the code samples above. Please test it and provide feedback in Feeds Fetcher Archive's issue queue.

Thanks, Dezső.