forked from Wavyzz/dolibarr
648 lines
21 KiB
PHP
Executable File
648 lines
21 KiB
PHP
Executable File
<?php
|
|
/* Copyright (C) 2011 Laurent Destailleur <eldy@users.sourceforge.net>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/**
|
|
* \file htdocs/core/class/rssparser.class.php
|
|
* \ingroup core
|
|
* \brief File of class to parse rss feeds
|
|
*/
|
|
class RssParser
|
|
{
|
|
var $db;
|
|
var $error;
|
|
|
|
protected $_format='';
|
|
protected $_urlRSS;
|
|
protected $_language;
|
|
protected $_generator;
|
|
protected $_copyright;
|
|
protected $_lastbuilddate;
|
|
protected $_imageurl;
|
|
protected $_link;
|
|
protected $_title;
|
|
protected $_description;
|
|
protected $_lastfetchdate; // Last successful fetch
|
|
protected $_rssarray=array();
|
|
|
|
// Accessors
|
|
public function getFormat() { return $this->_format; }
|
|
public function getUrlRss() { return $this->_urlRSS; }
|
|
public function getLanguage() { return $this->_language; }
|
|
public function getGenerator() { return $this->_generator; }
|
|
public function getCopyright() { return $this->_copyright; }
|
|
public function getLastBuildDate() { return $this->_lastbuilddate; }
|
|
public function getImageUrl() { return $this->_imageurl; }
|
|
public function getLink() { return $this->_link; }
|
|
public function getTitle() { return $this->_title; }
|
|
public function getDescription() { return $this->_description; }
|
|
public function getLastFetchDate() { return $this->_lastfetchdate; }
|
|
public function getItems() { return $this->_rssarray; }
|
|
|
|
|
|
// For parsing with xmlparser
|
|
var $stack = array(); // parser stack
|
|
var $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
|
|
|
|
|
|
/**
|
|
* Constructor
|
|
*/
|
|
public function RssParser($db)
|
|
{
|
|
$this->db=$db;
|
|
}
|
|
|
|
|
|
/**
|
|
* Parse rss URL
|
|
*
|
|
* @param urlRSS Url to parse
|
|
* @param maxNb Max nb of records to get (0 for no limit)
|
|
* @param cachedelay 0=No cache, nb of seconds we accept cache files (cachedir must also be defined)
|
|
* @param cachedir Directory where to save cache file
|
|
* @return int <0 if KO, >0 if OK
|
|
*/
|
|
public function parser($urlRSS, $maxNb=0, $cachedelay=60, $cachedir='')
|
|
{
|
|
include_once(DOL_DOCUMENT_ROOT.'/lib/files.lib.php');
|
|
|
|
$str=''; // This will contain content of feed
|
|
|
|
// Check parameters
|
|
if (! dol_is_url($urlRSS))
|
|
{
|
|
$this->error="ErrorBadUrl";
|
|
return -1;
|
|
}
|
|
|
|
$this->_urlRSS = $urlRSS;
|
|
$newpathofdestfile=$cachedir.'/'.dol_hash($this->_urlRSS);
|
|
$newmask='0644';
|
|
|
|
//dol_syslog("RssPArser::parser parse url=".$urlRSS." => cache file=".$newpathofdestfile);
|
|
$nowgmt = dol_now();
|
|
|
|
// Search into cache
|
|
$foundintocache=0;
|
|
if ($cachedelay > 0 && $cachedir)
|
|
{
|
|
$filedate=dol_filemtime($newpathofdestfile);
|
|
if ($filedate >= ($nowgmt - $cachedelay))
|
|
{
|
|
//dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is not older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we use it.");
|
|
$foundintocache=1;
|
|
|
|
$this->_lastfetchdate=$filedate;
|
|
}
|
|
else
|
|
{
|
|
dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is not found or older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we can't use it.");
|
|
}
|
|
}
|
|
|
|
// Load file into $str
|
|
if ($foundintocache) // Cache file found and is not too old
|
|
{
|
|
$str = file_get_contents($newpathofdestfile);
|
|
}
|
|
else
|
|
{
|
|
try {
|
|
ini_set("user_agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");
|
|
ini_set("max_execution_time", 10);
|
|
$str = file_get_contents($this->_urlRSS);
|
|
}
|
|
catch (Exception $e) {
|
|
print 'Error retrieving URL '.$this->urlRSS.' - '.$e->getMessage();
|
|
}
|
|
}
|
|
|
|
// Convert $str into xml
|
|
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
|
|
{
|
|
//print 'xx'.LIBXML_NOCDATA;
|
|
libxml_use_internal_errors(false);
|
|
$rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA);
|
|
}
|
|
else
|
|
{
|
|
$xmlparser=xml_parser_create('');
|
|
if (!is_resource($xmlparser)) { $this->error="ErrorFailedToCreateParser"; return -1; }
|
|
|
|
xml_set_object( $xmlparser, $this );
|
|
xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element' );
|
|
xml_set_character_data_handler( $xmlparser, 'feed_cdata' );
|
|
$status = xml_parse( $xmlparser, $str );
|
|
xml_parser_free( $xmlparser );
|
|
$rss=$this;
|
|
//var_dump($rss->_format);exit;
|
|
}
|
|
|
|
// If $rss loaded
|
|
if ($rss)
|
|
{
|
|
// Save file into cache
|
|
if (empty($foundintocache) && $cachedir)
|
|
{
|
|
dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is saved onto disk.");
|
|
if (! dol_is_dir($cachedir)) dol_mkdir($cachedir);
|
|
$fp = fopen($newpathofdestfile, 'w');
|
|
fwrite($fp, $str);
|
|
fclose($fp);
|
|
if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK;
|
|
@chmod($newpathofdestfile, octdec($newmask));
|
|
|
|
$this->_lastfetchdate=$nowgmt;
|
|
}
|
|
|
|
unset($str); // Free memory
|
|
|
|
if (empty($rss->_format)) // If format not detected automatically
|
|
{
|
|
$rss->_format='rss';
|
|
if (empty($rss->channel)) $rss->_format='atom';
|
|
}
|
|
|
|
$items=array();
|
|
|
|
// Save description entries
|
|
if ($rss->_format == 'rss')
|
|
{
|
|
//var_dump($rss);
|
|
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
|
|
{
|
|
if (!empty($rss->channel->language)) $this->_language = (string) $rss->channel->language;
|
|
if (!empty($rss->channel->generator)) $this->_generator = (string) $rss->channel->generator;
|
|
if (!empty($rss->channel->copyright)) $this->_copyright = (string) $rss->channel->copyright;
|
|
if (!empty($rss->channel->lastbuilddate)) $this->_lastbuilddate = (string) $rss->channel->lastbuilddate;
|
|
if (!empty($rss->channel->image->url[0])) $this->_imageurl = (string) $rss->channel->image->url[0];
|
|
if (!empty($rss->channel->link)) $this->_link = (string) $rss->channel->link;
|
|
if (!empty($rss->channel->title)) $this->_title = (string) $rss->channel->title;
|
|
if (!empty($rss->channel->description)) $this->_description = (string) $rss->channel->description;
|
|
}
|
|
else
|
|
{
|
|
//var_dump($rss->channel);
|
|
if (!empty($rss->channel['language'])) $this->_language = (string) $rss->channel['language'];
|
|
if (!empty($rss->channel['generator'])) $this->_generator = (string) $rss->channel['generator'];
|
|
if (!empty($rss->channel['copyright'])) $this->_copyright = (string) $rss->channel['copyright'];
|
|
if (!empty($rss->channel['lastbuilddate'])) $this->_lastbuilddate = (string) $rss->channel['lastbuilddate'];
|
|
if (!empty($rss->image['url'])) $this->_imageurl = (string) $rss->image['url'];
|
|
if (!empty($rss->channel['link'])) $this->_link = (string) $rss->channel['link'];
|
|
if (!empty($rss->channel['title'])) $this->_title = (string) $rss->channel['title'];
|
|
if (!empty($rss->channel['description'])) $this->_description = (string) $rss->channel['description'];
|
|
}
|
|
|
|
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) $items=$rss->channel->item; // With simplexml
|
|
else $items=$rss->items; // With xmlparse
|
|
//var_dump($items);exit;
|
|
}
|
|
else if ($rss->_format == 'atom')
|
|
{
|
|
//var_dump($rss);
|
|
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
|
|
{
|
|
if (!empty($rss->generator)) $this->_generator = (string) $rss->generator;
|
|
if (!empty($rss->lastbuilddate)) $this->_lastbuilddate = (string) $rss->modified;
|
|
if (!empty($rss->link->href)) $this->_link = (string) $rss->link->href;
|
|
if (!empty($rss->title)) $this->_title = (string) $rss->title;
|
|
if (!empty($rss->description)) $this->_description = (string) $rss->description;
|
|
}
|
|
else
|
|
{
|
|
//if (!empty($rss->channel['rss_language'])) $this->_language = (string) $rss->channel['rss_language'];
|
|
if (!empty($rss->channel['generator'])) $this->_generator = (string) $rss->channel['generator'];
|
|
//if (!empty($rss->channel['rss_copyright'])) $this->_copyright = (string) $rss->channel['rss_copyright'];
|
|
if (!empty($rss->channel['modified'])) $this->_lastbuilddate = (string) $rss->channel['modified'];
|
|
//if (!empty($rss->image['rss_url'])) $this->_imageurl = (string) $rss->image['rss_url'];
|
|
if (!empty($rss->channel['link'])) $this->_link = (string) $rss->channel['link'];
|
|
if (!empty($rss->channel['title'])) $this->_title = (string) $rss->channel['title'];
|
|
//if (!empty($rss->channel['rss_description'])) $this->_description = (string) $rss->channel['rss_description'];
|
|
}
|
|
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML)) { $tmprss=xml2php($rss); $items=$tmprss['entry'];} // With simplexml
|
|
else $items=$rss->items; // With xmlparse
|
|
//var_dump($items);exit;
|
|
}
|
|
|
|
$i = 0;
|
|
// Loop on each record
|
|
foreach($items as $item)
|
|
{
|
|
//var_dump($item);exit;
|
|
if ($rss->_format == 'rss')
|
|
{
|
|
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
|
|
{
|
|
$itemLink = (string) $item->link;
|
|
$itemTitle = (string) $item->title;
|
|
$itemDescription = (string) $item->description;
|
|
$itemPubDate = (string) $item->pubDate;
|
|
$itemId = '';
|
|
$itemAuthor = '';
|
|
}
|
|
else
|
|
{
|
|
$itemLink = (string) $item['link'];
|
|
$itemTitle = (string) $item['title'];
|
|
$itemDescription = (string) $item['description'];
|
|
$itemPubDate = (string) $item['pubdate'];
|
|
$itemId = (string) $item['guid'];
|
|
$itemAuthor = (string) $item['author'];
|
|
}
|
|
|
|
// Loop on each category
|
|
$itemCategory=array();
|
|
if (is_array($item->category))
|
|
{
|
|
foreach ($item->category as $cat)
|
|
{
|
|
$itemCategory[] = (string) $cat;
|
|
}
|
|
}
|
|
}
|
|
else if ($rss->_format == 'atom')
|
|
{
|
|
if (! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML))
|
|
{
|
|
$itemLink = (string) $item['link']['href'];
|
|
$itemTitle = (string) $item['title'];
|
|
$itemDescription = (string) $item['summary'];
|
|
$itemPubDate = (string) $item['created'];
|
|
$itemId = (string) $item['id'];
|
|
$itemAuthor = (string) ($item['author']?$item['author']:$item['author_name']);
|
|
}
|
|
else
|
|
{
|
|
$itemLink = (string) $item['link']['href'];
|
|
$itemTitle = (string) $item['title'];
|
|
$itemDescription = (string) $item['summary'];
|
|
$itemPubDate = (string) $item['created'];
|
|
$itemId = (string) $item['id'];
|
|
$itemAuthor = (string) ($item['author']?$item['author']:$item['author_name']);
|
|
}
|
|
}
|
|
else print 'ErrorBadFeedFormat';
|
|
|
|
// Add record to result array
|
|
$this->_rssarray[$i] = array(
|
|
'link'=>$itemLink,
|
|
'title'=>$itemTitle,
|
|
'description'=>$itemDescription,
|
|
'pubDate'=>$itemPubDate,
|
|
'category'=>$itemCategory,
|
|
'id'=>$itemId,
|
|
'author'=>$itemAuthor);
|
|
//var_dump($this->_rssarray);
|
|
|
|
$i++;
|
|
|
|
if ($i > $maxNb) break; // We get all records we want
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
$this->error='ErrorFailedToLoadRSSFile';
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* Triggered when opened tag is found
|
|
*
|
|
* @param $p
|
|
* @param $element Tag
|
|
* @param $attrs Attributes of tags
|
|
*/
|
|
function feed_start_element($p, $element, &$attrs)
|
|
{
|
|
$el = $element = strtolower($element);
|
|
$attrs = array_change_key_case($attrs, CASE_LOWER);
|
|
|
|
// check for a namespace, and split if found
|
|
$ns = false;
|
|
if ( strpos( $element, ':' ) ) {
|
|
list($ns, $el) = explode( ':', $element, 2);
|
|
}
|
|
if ( $ns and $ns != 'rdf' ) {
|
|
$this->current_namespace = $ns;
|
|
}
|
|
|
|
// if feed type isn't set, then this is first element of feed identify feed from root element
|
|
if (empty($this->_format))
|
|
{
|
|
if ( $el == 'rdf' ) {
|
|
$this->_format = 'rss';
|
|
$this->feed_version = '1.0';
|
|
}
|
|
elseif ( $el == 'rss' ) {
|
|
$this->_format = 'rss';
|
|
$this->feed_version = $attrs['version'];
|
|
}
|
|
elseif ( $el == 'feed' ) {
|
|
$this->_format = 'atom';
|
|
$this->feed_version = $attrs['version'];
|
|
$this->inchannel = true;
|
|
}
|
|
return;
|
|
}
|
|
|
|
if ( $el == 'channel' )
|
|
{
|
|
$this->inchannel = true;
|
|
}
|
|
elseif ($el == 'item' or $el == 'entry' )
|
|
{
|
|
$this->initem = true;
|
|
if ( isset($attrs['rdf:about']) ) {
|
|
$this->current_item['about'] = $attrs['rdf:about'];
|
|
}
|
|
}
|
|
|
|
// if we're in the default namespace of an RSS feed,
|
|
// record textinput or image fields
|
|
elseif (
|
|
$this->_format == 'rss' and
|
|
$this->current_namespace == '' and
|
|
$el == 'textinput' )
|
|
{
|
|
$this->intextinput = true;
|
|
}
|
|
|
|
elseif (
|
|
$this->_format == 'rss' and
|
|
$this->current_namespace == '' and
|
|
$el == 'image' )
|
|
{
|
|
$this->inimage = true;
|
|
}
|
|
|
|
# handle atom content constructs
|
|
elseif ( $this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
|
|
{
|
|
// avoid clashing w/ RSS mod_content
|
|
if ($el == 'content' ) {
|
|
$el = 'atom_content';
|
|
}
|
|
|
|
$this->incontent = $el;
|
|
|
|
|
|
}
|
|
|
|
// if inside an Atom content construct (e.g. content or summary) field treat tags as text
|
|
elseif ($this->_format == 'atom' and $this->incontent )
|
|
{
|
|
// if tags are inlined, then flatten
|
|
$attrs_str = join(' ',
|
|
array_map('map_attrs',
|
|
array_keys($attrs),
|
|
array_values($attrs) ) );
|
|
|
|
$this->append_content( "<$element $attrs_str>" );
|
|
|
|
array_unshift( $this->stack, $el );
|
|
}
|
|
|
|
// Atom support many links per containging element.
|
|
// Magpie treats link elements of type rel='alternate'
|
|
// as being equivalent to RSS's simple link element.
|
|
//
|
|
elseif ($this->_format == 'atom' and $el == 'link' )
|
|
{
|
|
if ( isset($attrs['rel']) and $attrs['rel'] == 'alternate' )
|
|
{
|
|
$link_el = 'link';
|
|
}
|
|
else {
|
|
$link_el = 'link_' . $attrs['rel'];
|
|
}
|
|
|
|
$this->append($link_el, $attrs['href']);
|
|
}
|
|
// set stack[0] to current element
|
|
else {
|
|
array_unshift($this->stack, $el);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* Triggered when CDATA is found
|
|
*
|
|
* @param $p
|
|
* @param $text Tag
|
|
*/
|
|
function feed_cdata($p, $text)
|
|
{
|
|
if ($this->_format == 'atom' and $this->incontent)
|
|
{
|
|
$this->append_content( $text );
|
|
}
|
|
else {
|
|
$current_el = join('_', array_reverse($this->stack));
|
|
$this->append($current_el, $text);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Triggered when closed tag is found
|
|
*
|
|
* @param $p
|
|
* @param $el Tag
|
|
*/
|
|
function feed_end_element($p, $el)
|
|
{
|
|
$el = strtolower($el);
|
|
|
|
if ( $el == 'item' or $el == 'entry' )
|
|
{
|
|
$this->items[] = $this->current_item;
|
|
$this->current_item = array();
|
|
$this->initem = false;
|
|
}
|
|
elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput' )
|
|
{
|
|
$this->intextinput = false;
|
|
}
|
|
elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image' )
|
|
{
|
|
$this->inimage = false;
|
|
}
|
|
elseif ($this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
|
|
{
|
|
$this->incontent = false;
|
|
}
|
|
elseif ($el == 'channel' or $el == 'feed' )
|
|
{
|
|
$this->inchannel = false;
|
|
}
|
|
elseif ($this->_format == 'atom' and $this->incontent ) {
|
|
// balance tags properly
|
|
// note: i don't think this is actually neccessary
|
|
if ( $this->stack[0] == $el )
|
|
{
|
|
$this->append_content("</$el>");
|
|
}
|
|
else {
|
|
$this->append_content("<$el />");
|
|
}
|
|
|
|
array_shift( $this->stack );
|
|
}
|
|
else {
|
|
array_shift( $this->stack );
|
|
}
|
|
|
|
$this->current_namespace = false;
|
|
}
|
|
|
|
|
|
/**
|
|
* To concat 2 string with no warning if an operand is not defined
|
|
*
|
|
* @param $str1
|
|
* @param $str2
|
|
*/
|
|
function concat(&$str1, $str2="")
|
|
{
|
|
if (!isset($str1) ) {
|
|
$str1="";
|
|
}
|
|
$str1 .= $str2;
|
|
}
|
|
|
|
/**
|
|
*/
|
|
function append_content($text)
|
|
{
|
|
if ( $this->initem ) {
|
|
$this->concat( $this->current_item[ $this->incontent ], $text );
|
|
}
|
|
elseif ( $this->inchannel ) {
|
|
$this->concat( $this->channel[ $this->incontent ], $text );
|
|
}
|
|
}
|
|
|
|
/**
|
|
* smart append - field and namespace aware
|
|
*/
|
|
function append($el, $text)
|
|
{
|
|
if (!$el) {
|
|
return;
|
|
}
|
|
if ( $this->current_namespace )
|
|
{
|
|
if ( $this->initem ) {
|
|
$this->concat(
|
|
$this->current_item[ $this->current_namespace ][ $el ], $text);
|
|
}
|
|
elseif ($this->inchannel) {
|
|
$this->concat(
|
|
$this->channel[ $this->current_namespace][ $el ], $text );
|
|
}
|
|
elseif ($this->intextinput) {
|
|
$this->concat(
|
|
$this->textinput[ $this->current_namespace][ $el ], $text );
|
|
}
|
|
elseif ($this->inimage) {
|
|
$this->concat(
|
|
$this->image[ $this->current_namespace ][ $el ], $text );
|
|
}
|
|
}
|
|
else {
|
|
if ( $this->initem ) {
|
|
$this->concat(
|
|
$this->current_item[ $el ], $text);
|
|
}
|
|
elseif ($this->intextinput) {
|
|
$this->concat(
|
|
$this->textinput[ $el ], $text );
|
|
}
|
|
elseif ($this->inimage) {
|
|
$this->concat(
|
|
$this->image[ $el ], $text );
|
|
}
|
|
elseif ($this->inchannel) {
|
|
$this->concat(
|
|
$this->channel[ $el ], $text );
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/**
|
|
* Function to convert an XML object into an array
|
|
*/
|
|
function xml2php($xml)
|
|
{
|
|
$fils = 0;
|
|
$tab = false;
|
|
$array = array();
|
|
foreach($xml->children() as $key => $value)
|
|
{
|
|
$child = xml2php($value);
|
|
|
|
//To deal with the attributes
|
|
foreach($value->attributes() as $ak=>$av)
|
|
{
|
|
$child[$ak] = (string)$av;
|
|
|
|
}
|
|
|
|
//Let see if the new child is not in the array
|
|
if($tab==false && in_array($key,array_keys($array)))
|
|
{
|
|
//If this element is already in the array we will create an indexed array
|
|
$tmp = $array[$key];
|
|
$array[$key] = NULL;
|
|
$array[$key][] = $tmp;
|
|
$array[$key][] = $child;
|
|
$tab = true;
|
|
}
|
|
elseif($tab == true)
|
|
{
|
|
//Add an element in an existing array
|
|
$array[$key][] = $child;
|
|
}
|
|
else
|
|
{
|
|
//Add a simple element
|
|
$array[$key] = $child;
|
|
}
|
|
|
|
$fils++;
|
|
}
|
|
|
|
|
|
if($fils==0)
|
|
{
|
|
return (string)$xml;
|
|
}
|
|
|
|
return $array;
|
|
|
|
}
|
|
|
|
?>
|