UP | HOME

KrISS php5

This Rss class was developed for KrISS feed, but it is useful for parsing any RSS/Atom feed or XML file.

Table of Contents

1 Features

  • Only depends on libxml2 library (no SimpleXML or other)
  • Xml validity is performed by libxml2
  • Parse rdf:RDF, rss, atom feeds
  • Let you customize what information you want with simple syntax
  • Created for rss/atom feed but useful for common xml format

2 How to use

2.1 demo.php

This demo page lets you get RSS/Atom items.

<?php
    // Demo page: load the feed whose URL was posted by the form below and
    // list its items. Every value coming from the request or from the remote
    // feed is treated as untrusted.
    include 'Rss.php';

    // Message displayed to the visitor when the feed cannot be used.
    $error = '';

    if (isset($_POST['url'])) {
        // filter_var() returns false for invalid input (including arrays).
        $url = filter_var($_POST['url'], FILTER_VALIDATE_URL);

        // FILTER_VALIDATE_URL also accepts schemes such as file:// or php://,
        // which would let a visitor read local resources: only allow HTTP(S).
        $scheme = '';
        if ($url !== false) {
            $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
        }

        if ($scheme !== 'http' && $scheme !== 'https') {
            $error = 'Invalid url';
        } else {
            $data = file_get_contents($url);
            if ($data === false || $data === '') {
                $error = 'Unable to load url';
            } else {
                $rss = Rss::loadDom($data);
                if (!empty($rss['error'])) {
                    $error = $rss['error'];
                } elseif (Rss::getType($rss['dom']) === Rss::UNKNOWN) {
                    // Well-formed XML that is neither RSS nor Atom.
                    $error = 'Not a RSS/Atom feed';
                } else {
                    $dom = $rss['dom'];
                    // -1 means "no limit"; anything below 1 is treated the same.
                    $max = -1;
                    if (isset($_POST['max']) && is_string($_POST['max'])) {
                        $max = (int) $_POST['max'];
                        if ($max < 1) {
                            $max = -1;
                        }
                    }
                }
            }
        }
    }
?>
<!DOCTYPE html>
<html>
  <head>
    <title>Rss</title>
    <meta charset="utf-8">
  </head>
  <body>
    <form method="post" action="">
      Url : <input type="text" name="url"><br>
      Max items : <input type="text" name="max"><br>
      <input type="submit" value="Submit">
    </form>
    <?php
        if (!empty($error)) {
            // libxml messages may quote parts of the remote document.
            echo htmlspecialchars($error);
        }
        if (isset($dom)) {
            $feed = Rss::getFeed($dom);
            // Fetch every item and trim here, so that a limit larger than the
            // number of items in the feed is harmless.
            $items = Rss::getItems($dom);
            if ($max > 0) {
                $items = array_slice($items, 0, $max);
            }
    ?>
<a
  href="<?php echo htmlspecialchars($feed['htmlUrl']); ?>"
  title="<?php echo htmlspecialchars($feed['description']); ?>"
>
<?php echo htmlspecialchars($feed['title']); ?>
</a> :
<ol>
<?php
  foreach ($items as $item) {
      // strtotime() returns false for a missing or unparsable date.
      $timestamp = strtotime($item['time']);
      $date = '';
      if ($timestamp !== false) {
          $date = utf8_encode(strftime('%A %d %B %Y - %H:%M', $timestamp));
      }

      // Truncate BEFORE escaping so an HTML entity is never cut in half, and
      // prefer the multibyte-aware function so UTF-8 characters stay whole.
      $text = strip_tags($item['content']);
      if (function_exists('mb_substr')) {
          $excerpt = mb_substr($text, 0, 100, 'UTF-8');
      } else {
          $excerpt = substr($text, 0, 100);
      }
?>
    <li>
      <a href="<?php echo htmlspecialchars($item['link']); ?>"><?php echo htmlspecialchars($item['title']); ?></a><br>
      <em><?php echo htmlspecialchars($date); ?></em>
      by <strong><?php echo htmlspecialchars($item['author']); ?></strong><br>
      <?php echo htmlspecialchars($excerpt).'...'; ?>
    </li>
<?php
  }
?>
</ol>
    <?php
        }
    ?>
  </body>
</html>

2.2 Rss.php

The Rss class is available on github : https://raw.github.com/tontof/kriss_php5/master/Rss.php

<?php
/**
 * Rss class
 *
 * Features:
 * - Only depends on libxml2 library (no SimpleXML)
 * - Xml validity is performed by libxml2
 * - Parse rdf:RDF, rss, atom feeds
 * - Let you customize what information you want with simple syntax
 * - Created for rss/atom feed but useful for common xml format
 *
 * Rule syntax (see $feedFormat / $itemFormat):
 * - "a>b"          : element b that is a direct child of element a
 * - ">a"           : leading empty selector, search inside the current
 *                    feed/item element instead of the whole document
 * - "a[attr]"      : value of attribute attr instead of the text content
 * - "a[attr=val]"  : only elements whose attribute attr equals val
 * - "a*"           : collect every match in an array instead of the first one
 *
 * How to use:
 * - http://tontof.net/kriss/php5/rss
 */
class Rss
{
    const UNKNOWN = 0;
    const RSS = 1;
    const ATOM = 2;

    /**
     * Rules used to extract feed information; for each key the first rule
     * giving a non-empty value wins.
     */
    public static $feedFormat = array(
       'title' => array('>title'),
       'description' => array('>description', '>subtitle'),
       'htmlUrl' => array('>link', '>link[rel=self][href]', '>link[href]', '>id')
    );

    /**
     * Rules used to extract item information; for each key the first rule
     * giving a non-empty value wins.
     */
    public static $itemFormat = array(
        'author' => array('>author>name', '>author', '>dc:creator', 'feed>author>name', '>dc:author', '>creator'),
        'content' => array('>content:encoded', '>content', '>description', '>summary', '>subtitle'),
        'description' => array('>description', '>summary', '>subtitle', '>content', '>content:encoded'),
        'via' => array('>guid', '>id'),
        'link' => array('>feedburner:origLink', '>link[rel=alternate][href]', '>link[href]', '>link', '>guid', '>id'),
        'time' => array('>pubDate', '>updated', '>lastBuildDate', '>published', '>dc:date', '>date', '>created', '>modified'),
        'title' => array('>title')
    );

    /**
     * Split one selector ("name[attr=val][attr]") into its name and its
     * list of attribute tests.
     *
     * Every closing bracket is removed, not only the trailing one, so that
     * "link[rel=self][href]" gives array('rel=self', 'href') and not
     * array('rel=self]', 'href').
     *
     * @param string $selector to split
     *
     * @return array with the name (may be empty) and the array of attributes
     */
    private static function parseSelector($selector)
    {
        $parts = explode('[', str_replace(']', '', (string) $selector));
        $name = array_shift($parts);

        return array($name, $parts);
    }

    /**
     * Check for a list of attributes if current node is valid
     *
     * Each entry of $attrs is either "attr" (the attribute must exist) or
     * "attr=val" (the attribute must exist and be equal to val).
     *
     * @param DOMNode $node  to check if valid
     * @param array   $attrs to test if in $node
     *
     * @return boolean true if $node is valid for $attrs, false otherwise
     */
    public static function isValidNodeAttrs($node, $attrs)
    {
        foreach ($attrs as $attr) {
            // Tolerate a leftover closing bracket and ignore empty entries.
            $attr = rtrim($attr, ']');
            if ($attr === '') {
                continue;
            }

            // Reset for every attribute: the expected value of a previous
            // "attr=val" test must not leak onto the next attribute.
            $val = '';
            if (strpos($attr, '=') !== false) {
                list($attr, $val) = explode('=', $attr, 2);
            }

            if (!$node->hasAttribute($attr)
                || ($val !== '' && $node->getAttribute($attr) !== $val)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Check if tagName from $nodes are correct depending on $name
     *
     * @param DOMNodeList $nodes to check
     * @param string      $name  to compare with tagName
     *
     * @return array of nodes with correct $name
     */
    public static function filterNodeListByName($nodes, $name)
    {
        $res = array();

        for ($i = 0; $i < $nodes->length; $i++) {
            $current = $nodes->item($i);
            if ($current->tagName === $name) {
                $res[] = $current;
            }
        }

        return $res;
    }

    /**
     * Return array of descendant DOMNode of $node with tagName equals to $name
     *
     * A prefixed name ("dc:creator") is searched by local name in any
     * namespace, then filtered on the complete tag name.
     *
     * @param DOMNode $node to starts with
     * @param string  $name of descendant
     *
     * @return array of descendant DOMNode with tagName equals to $name
     */
    public static function getNodesName($node, $name)
    {
        if (strpos($name, ':') !== false) {
            list(, $localname) = explode(':', $name);
            $nodes = $node->getElementsByTagNameNS('*', $localname);
        } else {
            $nodes = $node->getElementsByTagName($name);
        }

        return self::filterNodeListByName($nodes, $name);
    }

    /**
     * Return content of $node depending on defined $selectors
     *
     * Only direct children of $node are considered at each step. When the
     * name of the last selector ends with a star, every match is returned
     * in an array; otherwise the first non-empty match is returned.
     *
     * @param DOMNode $node      to starts with
     * @param array   $selectors defined using node>node[attr=val][attr]
     *
     * @return string|array the desired selection, an empty string if not
     *                      found, or an array for a star selector
     */
    public static function getElement($node, $selectors)
    {
        $res = '';
        list($name, $attributes) = self::parseSelector(array_shift($selectors));
        if (substr($name, -1) === '*') {
            $name = substr($name, 0, -1);
            $res = array();
        }

        // The last bracket decides what is extracted: a bare "[attr]" asks
        // for that attribute, anything else asks for the text content.
        $last = end($attributes);
        $wanted = null;
        if (!empty($last) && strpos($last, '=') === false) {
            $wanted = $last;
        }

        foreach (self::getNodesName($node, $name) as $currentNode) {
            if (!$currentNode->parentNode->isSameNode($node)
                || !self::isValidNodeAttrs($currentNode, $attributes)) {
                continue;
            }

            if (!empty($selectors)) {
                // Not the last selector yet: go one level deeper.
                return self::getElement($currentNode, $selectors);
            }

            if ($wanted === null) {
                $value = $currentNode->textContent;
            } else {
                $value = $currentNode->getAttribute($wanted);
            }

            if (is_array($res)) {
                $res[] = $value;
            } else {
                $res = $value;
                if (!empty($res)) {
                    break;
                }
            }
        }

        return $res;
    }

    /**
     * Format $element depending on $formats
     *
     * For each key of $formats the rules are tried in order until one of
     * them gives a non-empty value. A rule starting with ">" (or with an
     * attribute test) is relative to $element; a rule starting with a name
     * is searched from the whole $dom.
     *
     * @param DOMDocument $dom     of document
     * @param DOMNode     $element to starts with
     * @param array       $formats to use to extract information
     *
     * @return array of extracted information
     */
    public static function formatElement($dom, $element, $formats)
    {
        $newElement = array();
        foreach ($formats as $format => $list) {
            $newElement[$format] = '';
            foreach ($list as $rule) {
                if (!empty($newElement[$format])) {
                    break;
                }

                $selectors = explode('>', $rule);
                list($name, $attributes) = self::parseSelector(
                    array_shift($selectors)
                );

                if ($name === '') {
                    // Relative rule, possibly guarded by attribute tests on
                    // $element itself (no test means always valid).
                    if (self::isValidNodeAttrs($element, $attributes)) {
                        $newElement[$format] = self::getElement($element, $selectors);
                    }
                    continue;
                }

                foreach (self::getNodesName($dom, $name) as $node) {
                    if (self::isValidNodeAttrs($node, $attributes)) {
                        $newElement[$format] = self::getElement($node, $selectors);
                    }
                    if (!empty($newElement[$format])) {
                        break;
                    }
                }
            }
        }

        return $newElement;
    }

    /**
     * Return array of feed from a DOMDocument
     *
     * @param DOMDocument $dom
     *
     * @return array of feed info extracted from $dom; every value is an
     *               empty string when $dom is neither a rss nor an atom feed
     */
    public static function getFeed($dom)
    {
        $type = self::getType($dom);
        if ($type === self::RSS) {
            $feed = $dom->getElementsByTagName('channel')->item(0);
        } elseif ($type === self::ATOM) {
            $feed = $dom->getElementsByTagName('feed')->item(0);
        } else {
            // No feed element to read from: answer with empty information
            // instead of handing a non-element to formatElement.
            return array_fill_keys(array_keys(self::$feedFormat), '');
        }

        return self::formatElement($dom, $feed, self::$feedFormat);
    }

    /**
     * Return array of items from a DOMDocument
     *
     * @param DOMDocument $dom
     * @param integer     $nb of items to select, a negative value (default)
     *                        selects every item
     *
     * @return array of items extracted from the $dom
     */
    public static function getItems($dom, $nb = -1)
    {
        $type = self::getType($dom);
        if ($type === self::RSS) {
            $items = $dom->getElementsByTagName('item');
        } elseif ($type === self::ATOM) {
            $items = $dom->getElementsByTagName('entry');
        } else {
            return array();
        }

        // Never iterate past the end of the list: item() returns null there.
        $nb = (int) $nb;
        $max = $nb < 0 ? $items->length : min($nb, $items->length);

        $newItems = array();
        for ($i = 0; $i < $max; $i++) {
            $newItems[] = self::formatElement($dom, $items->item($i), self::$itemFormat);
        }

        return $newItems;
    }

    /**
     * Return type of a DOMDocument
     *
     * The document is a RSS feed when it contains a channel element, an
     * ATOM feed when it contains a feed element, UNKNOWN otherwise.
     *
     * @param DOMDocument $dom
     *
     * @return const corresponding to the type of $dom
     */
    public static function getType($dom)
    {
        if ($dom->getElementsByTagName('channel')->item(0)) {
            return self::RSS;
        }

        if ($dom->getElementsByTagName('feed')->item(0)) {
            return self::ATOM;
        }

        return self::UNKNOWN;
    }

    /**
     * Load a XML string into DOMDocument
     *
     * @param string $data
     *
     * @return array with a DOMDocument ('dom') and a string error ('error',
     *               empty when libxml2 reported nothing)
     */
    public static function loadDom($data)
    {
        // Typically the result of a failed download (false or ''): there is
        // nothing to parse, so report it without calling the parser.
        if (!is_string($data) || $data === '') {
            return array(
                'dom' => new DOMDocument(),
                'error' => 'Error XML: empty document'
            );
        }

        libxml_clear_errors();
        // Parsing problems are read back with libxml_get_last_error(), the
        // PHP warnings they raise are silenced meanwhile.
        set_error_handler(array('Rss', 'silenceErrors'));
        $dom = new DOMDocument();
        $dom->loadXML($data);
        restore_error_handler();

        return array(
            'dom' => $dom,
            'error' => self::getError(libxml_get_last_error())
        );
    }

    /**
     * Explicit libxml2 error
     *
     * @param LibXMLError $error or false when there is no error
     *
     * @return string of the error, empty string when $error is false
     */
    public static function getError($error)
    {
        $return = '';

        if ($error !== false) {
            switch ($error->level) {
            case LIBXML_ERR_WARNING:
                $return = "Warning XML $error->code: ";
                break;
            case LIBXML_ERR_ERROR:
                $return = "Error XML $error->code: ";
                break;
            case LIBXML_ERR_FATAL:
                $return = "Fatal Error XML $error->code: ";
                break;
            }
            // Append the message once to the level prefix.
            $return .= trim($error->message);
        }

        return $return;
    }

    /**
     * Error handler doing nothing (idea taken from SimplePie)
     *
     * @param integer $num of errno
     * @param string  $str of errstr
     */
    public static function silenceErrors($num, $str)
    {
        // No-op
    }
}

3 Advanced options

By default, the Rss class lets you parse most RSS/Atom feeds. However, you may want to extract custom information by modifying $feedFormat and $itemFormat. The order of the rules also defines their priority: the first rule that yields a non-empty value is selected and the remaining rules are not checked.

3.1 Child selection >

Using > lets you capture different elements. If the rule begins with an empty selector, that means it will search inside feed or item element.

With the author format, the rule >author>name tries to capture the name element defined inside the author element defined inside the item element, while the rule feed>author>name tries to capture the name element defined inside the author element defined inside a feed element located anywhere in the DOM.

3.2 Attribute selection [attribute]

By default, the content of tag is selected. You may want to select an attribute of element using [attribute] selection.

With the htmlUrl format, the rule >link[href] selects the value of the href attribute of a link element defined inside the feed element.

3.3 Conditional selection [attribute=value]

When parsing elements, you may want to test some attributes.

With the htmlUrl format, the rule >link[rel=self][href] tries to capture the href attribute of the link element whose rel attribute has the value self.

3.4 Star selection and customization

Using Rss class, you may add different rules and select several elements inside feed.

For example, you may want to get enclosure : http://foz.home.xs4all.nl/mod_enclosure.html

Let's use this example : http://www.rssboard.org/files/example-multiple-enclosures.xml

If you try on demo page, the author is missing.

As you can see in the XML, the author is defined with an itunes:author tag inside the channel element.

In order to add this rule to Rss class, you just need to add a rule to item author:

Rss::$itemFormat['author'][] = 'channel>itunes:author';

In order to get the first enclosure:

Rss::$itemFormat['enclosure'] = array('>enclosure[url]');

But you may want all enclosures by adding a star:

Rss::$itemFormat['enclosure'] = array('>enclosure*[url]');

3.5 Advanced demo

You can try the advanced demo:

<?php
    // Advanced demo: add custom rules to the Rss class, then display a
    // sample feed with all the enclosures of each item.
    include 'Rss.php';

    // Fall back to the channel-level itunes:author when an item has no author.
    Rss::$itemFormat['author'][] = 'channel>itunes:author';
    // The star collects every enclosure url of an item in an array.
    Rss::$itemFormat['enclosure'] = array('>enclosure*[url]');

    // Message displayed when the feed cannot be used.
    $error = '';

    $data = file_get_contents(
        'http://www.rssboard.org/files/example-multiple-enclosures.xml'
    );
    if ($data === false || $data === '') {
        $error = 'Unable to load url';
    } else {
        $rss = Rss::loadDom($data);
        if (!empty($rss['error'])) {
            $error = $rss['error'];
        } elseif (Rss::getType($rss['dom']) === Rss::UNKNOWN) {
            // Well-formed XML that is neither RSS nor Atom.
            $error = 'Not a RSS/Atom feed';
        } else {
            $dom = $rss['dom'];
        }
    }

?>
<!DOCTYPE html>
<html>
  <head>
    <title>Rss</title>
    <meta charset="utf-8">
  </head>
  <body>
<?php
if (!empty($error)) {
    // Escaped because libxml messages may quote the remote document; echoed
    // rather than die()-ed so that the closing tags below are still sent.
    echo htmlspecialchars($error);
}
if (isset($dom)) {
    $feed = Rss::getFeed($dom);
    $items = Rss::getItems($dom);
?>
<a
  href="<?php echo htmlspecialchars($feed['htmlUrl']); ?>"
  title="<?php echo htmlspecialchars($feed['description']); ?>"
>
<?php echo htmlspecialchars($feed['title']); ?>
</a> :
<ol>
<?php
foreach ($items as $item) {
    // strtotime() returns false for a missing or unparsable date.
    $timestamp = strtotime($item['time']);
    $date = '';
    if ($timestamp !== false) {
        $date = utf8_encode(strftime('%A %d %B %Y - %H:%M', $timestamp));
    }

    // Truncate BEFORE escaping so an HTML entity is never cut in half, and
    // prefer the multibyte-aware function so UTF-8 characters stay whole.
    $text = strip_tags($item['content']);
    if (function_exists('mb_substr')) {
        $excerpt = mb_substr($text, 0, 100, 'UTF-8');
    } else {
        $excerpt = substr($text, 0, 100);
    }

    // Always iterate over an array, even if no enclosure was extracted.
    $enclosures = empty($item['enclosure']) ? array() : (array) $item['enclosure'];
?>
    <li>
      <a href="<?php echo htmlspecialchars($item['link']); ?>"><?php echo htmlspecialchars($item['title']); ?></a><br>
      <em><?php echo htmlspecialchars($date); ?></em>
      by <strong><?php echo htmlspecialchars($item['author']); ?></strong><br>
      <?php echo htmlspecialchars($excerpt).'...'; ?><br>
      enclosures:<br>
      <?php foreach ($enclosures as $enclosure) { ?>
      <a href="<?php echo htmlspecialchars($enclosure); ?>">
        <?php echo htmlspecialchars($enclosure); ?>
      </a><br>
      <?php } ?>
    </li>
<?php } ?>
</ol>
<?php
}
?>
  </body>
</html>

4 See alternatives

5 More infos about RSS/Atom

Date: 2013-08-31 12:04:20 CEST