[LARAVEL] WordPress RSS content scraping - fourslickz/notes GitHub Wiki
<?php
namespace App\Console\Commands;
use Illuminate\Console\Command;
use GuzzleHttp\Client as GuzzleClient;
use DOMDocument;
use DOMXPath;
/**
 * Artisan command that reads a WordPress RSS feed, prints every item, and then
 * scrapes each item's article page for its title, body text and lead image.
 *
 * A failure on a single article page is reported and skipped so that the rest
 * of the feed is still processed; a failure to fetch or parse the feed itself
 * ends the command with a non-zero exit code.
 */
class GetRssRainas extends Command
{
    /**
     * Feed fetched when no --url option is given.
     *
     * Kept in a constant rather than as an option default in the signature
     * because the URL contains "=" which the signature parser would mis-split.
     */
    const DEFAULT_FEED_URL = 'http://rainas12.pramuka.or.id/feed?paged=2';

    /**
     * Seconds to wait for a single HTTP request before giving up.
     */
    const HTTP_TIMEOUT = 30;

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'app:get-rss-rainas {--url= : WordPress RSS feed URL to fetch instead of the default}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Get RSS Rainas from http://rainas12.pramuka.or.id/feed';

    /**
     * Lazily created HTTP client shared by every request of one run.
     *
     * @var GuzzleClient|null
     */
    private $httpClient;

    /**
     * Execute the console command.
     *
     * @return int Process exit code: 0 on success, 1 when the feed could not
     *             be fetched or parsed.
     */
    public function handle()
    {
        $feedUrl = $this->option('url') ?: self::DEFAULT_FEED_URL;

        $rssContent = $this->fetch($feedUrl);
        if ($rssContent === null) {
            $this->error("Failed to fetch the RSS feed.");

            return 1;
        }

        $rss = $this->parseFeed($rssContent);
        if ($rss === null) {
            $this->error("Failed to parse the RSS feed.");

            return 1;
        }

        // A well-formed XML document that is not an RSS 2.0 feed (or an empty
        // feed) has no channel/item nodes; iterating it blindly would fail.
        if (!isset($rss->channel->item)) {
            $this->warn("The RSS feed contains no items.");

            return 0;
        }

        foreach ($rss->channel->item as $item) {
            $title = (string) $item->title;
            $link = (string) $item->link;
            $description = (string) $item->description;
            $pubDate = (string) $item->pubDate;

            $this->info("Title: $title");
            $this->info("Link: $link");
            $this->info("Description: $description");
            $this->info("Published Date: $pubDate");
            $this->info("");

            // An item without a link has no page to scrape.
            if ($link !== '') {
                $this->scrapUrl($link);
            }
        }

        return 0;
    }

    /**
     * Download an article page and print its title, text content and lead
     * image URL as a pretty-printed JSON object.
     *
     * Fields whose element is not found on the page are reported as null.
     * Pages that cannot be downloaded, or that come back empty, are skipped
     * with a warning.
     *
     * @param string $url Absolute URL of the article page.
     * @return void
     */
    private function scrapUrl($url)
    {
        $htmlString = $this->fetch($url);

        // DOMDocument::loadHTML() rejects an empty string, so bail out early.
        if ($htmlString === null || trim($htmlString) === '') {
            $this->warn("Skipping {$url}: no HTML content received.");

            return;
        }

        // Collect markup warnings internally instead of emitting them, then
        // drop them and restore the previous setting so the error buffer does
        // not grow with every scraped page.
        $previousSetting = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($htmlString);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $xpath = new DOMXPath($doc);

        // Each expression matches the element's complete class attribute
        // string exactly, so it depends on the site's current markup.
        $titleNodes = $xpath->evaluate('//h1[@class="is-title post-title"]');
        $descriptionNodes = $xpath->evaluate('//div[@class="post-content cf entry-content content-spacious"]');
        $imageNodes = $xpath->evaluate('//img[@class="attachment-large size-large lazyload wp-post-image"]');

        $data = [
            'url' => $url,
            'title' => $titleNodes->length > 0 ? $titleNodes->item(0)->nodeValue : null,
            'description' => $descriptionNodes->length > 0 ? $descriptionNodes->item(0)->nodeValue : null,
            // The image URL is read from the data-src attribute, not src.
            'image' => $imageNodes->length > 0 ? $imageNodes->item(0)->getAttribute('data-src') : null,
        ];

        // Written straight to stdout, exactly as encoded.
        echo json_encode($data, JSON_PRETTY_PRINT);
        echo "\n";
    }

    /**
     * Perform a GET request and return the response body.
     *
     * Transport errors and HTTP error responses are reported as a warning on
     * the console rather than thrown.
     *
     * @param string $url Absolute URL to request.
     * @return string|null Response body, or null when the request failed.
     */
    private function fetch($url)
    {
        try {
            $response = $this->httpClient()->get($url);
        } catch (\GuzzleHttp\Exception\GuzzleException $e) {
            $this->warn("Request to {$url} failed: " . $e->getMessage());

            return null;
        }

        return (string) $response->getBody();
    }

    /**
     * Parse RSS XML without letting malformed input emit PHP warnings.
     *
     * @param string $content Raw XML text of the feed.
     * @return \SimpleXMLElement|null Parsed document, or null when the text
     *                                is empty or not well-formed XML.
     */
    private function parseFeed($content)
    {
        if (trim($content) === '') {
            return null;
        }

        $previousSetting = libxml_use_internal_errors(true);
        $rss = simplexml_load_string($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        return $rss === false ? null : $rss;
    }

    /**
     * Return the shared HTTP client, creating it on first use.
     *
     * @return GuzzleClient
     */
    private function httpClient()
    {
        if ($this->httpClient === null) {
            $this->httpClient = new GuzzleClient(['timeout' => self::HTTP_TIMEOUT]);
        }

        return $this->httpClient;
    }
}