[LARAVEL] WordPress RSS content scraping - fourslickz/notes GitHub Wiki

<?php

namespace App\Console\Commands;

use Illuminate\Console\Command;
use GuzzleHttp\Client as GuzzleClient;
use DOMDocument;
use DOMXPath;

/**
 * Artisan command that reads a WordPress RSS feed, prints each item's
 * metadata and then scrapes the linked article page for its title,
 * body text and featured image.
 */
class GetRssRainas extends Command
{
    /**
     * Feed fetched when no --url option is supplied.
     */
    private const DEFAULT_FEED_URL = 'http://rainas12.pramuka.or.id/feed?paged=2';

    /**
     * Guzzle options applied to every request so that an unresponsive
     * server cannot hang the command indefinitely.
     */
    private const REQUEST_OPTIONS = [
        'connect_timeout' => 10,
        'timeout'         => 30,
    ];

    /**
     * The name and signature of the console command.
     *
     * The default feed URL is applied in handle() rather than in the
     * signature because the URL itself contains an "=" character.
     *
     * @var string
     */
    protected $signature = 'app:get-rss-rainas {--url= : Feed URL to fetch instead of the default Rainas feed}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Get RSS Rainas from http://rainas12.pramuka.or.id/feed';

    /**
     * Lazily created HTTP client, shared by all requests of one run.
     *
     * @var GuzzleClient|null
     */
    private $httpClient;

    /**
     * Execute the console command.
     *
     * @return int 0 when the feed was fetched and parsed, 1 otherwise.
     *             A single article that cannot be scraped is reported
     *             but does not change the exit code.
     */
    public function handle(): int
    {
        $feedUrl = $this->option('url') ?: self::DEFAULT_FEED_URL;

        $rssContent = $this->fetch($feedUrl);
        if ($rssContent === null) {
            $this->error("Failed to fetch the RSS feed.");

            return 1;
        }

        $rss = $this->parseFeed($rssContent);
        if ($rss === null) {
            $this->error("Failed to parse the RSS feed.");

            return 1;
        }

        // A document that parses but is not RSS 2.0 (or is empty) has no
        // channel/item nodes; iterating them blindly would raise warnings.
        if (!isset($rss->channel->item)) {
            $this->warn("The RSS feed contains no items.");

            return 0;
        }

        foreach ($rss->channel->item as $item) {
            $title = (string) $item->title;
            $link = (string) $item->link;
            $description = (string) $item->description;
            $pubDate = (string) $item->pubDate;

            $this->info("Title: $title");
            $this->info("Link: $link");
            $this->info("Description: $description");
            $this->info("Published Date: $pubDate");
            $this->info("");

            if ($link === '') {
                $this->warn("Item has no link; skipping scrape.");
                continue;
            }

            $this->scrapUrl($link);
        }

        return 0;
    }

    /**
     * Download an article page and print its title, text content and
     * featured image URL as pretty-printed JSON.
     *
     * Failures (HTTP errors, empty or unparsable bodies) are reported on
     * the console and the method returns, so the caller can continue
     * with the next item.
     *
     * @param string $url Absolute URL of the article page.
     * @return void
     */
    private function scrapUrl(string $url): void
    {
        $htmlString = $this->fetch($url);
        if ($htmlString === null) {
            return;
        }

        // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8).
        if (trim($htmlString) === '') {
            $this->warn("Empty response body for {$url}; skipping.");

            return;
        }

        // Real-world HTML is rarely valid; collect libxml's complaints
        // internally instead of emitting PHP warnings, then discard them and
        // restore the previous setting so this global state does not leak.
        $previous = libxml_use_internal_errors(true);
        try {
            $doc = new DOMDocument();
            $loaded = $doc->loadHTML($htmlString);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if ($loaded === false) {
            $this->warn("Could not parse HTML from {$url}; skipping.");

            return;
        }

        $xpath = new DOMXPath($doc);

        // The selectors match the exact class attribute values used by the
        // site's theme.
        $titleXPath = $xpath->evaluate('//h1[@class="is-title post-title"]');
        $descriptionXPath = $xpath->evaluate('//div[@class="post-content cf entry-content content-spacious"]');
        $imageXPath = $xpath->evaluate('//img[@class="attachment-large size-large lazyload wp-post-image"]');

        $title = $titleXPath->length > 0 ? $titleXPath->item(0)->nodeValue : null;
        $description = $descriptionXPath->length > 0 ? $descriptionXPath->item(0)->nodeValue : null;
        // The image is lazy-loaded, so the real URL is read from data-src.
        $image = $imageXPath->length > 0 ? $imageXPath->item(0)->getAttribute('data-src') : null;

        $data = [
            'url'   => $url,
            'title' => $title,
            'description' => $description,
            'image' => $image,
        ];

        echo json_encode($data, JSON_PRETTY_PRINT);
        echo "\n";
    }

    /**
     * Perform a GET request and return the response body.
     *
     * @param string $url URL to request.
     * @return string|null Body on success, null when the request failed
     *                     (the reason is printed to the console).
     */
    private function fetch(string $url): ?string
    {
        try {
            $response = $this->client()->get($url, self::REQUEST_OPTIONS);
        } catch (\GuzzleHttp\Exception\GuzzleException $e) {
            // Covers connection problems as well as 4xx/5xx responses,
            // which Guzzle turns into exceptions by default.
            $this->error("Request to {$url} failed: {$e->getMessage()}");

            return null;
        }

        return (string) $response->getBody();
    }

    /**
     * Parse an XML string without emitting PHP warnings for malformed input.
     *
     * @param string $xml Raw feed content.
     * @return \SimpleXMLElement|null Parsed document, or null on failure.
     */
    private function parseFeed(string $xml): ?\SimpleXMLElement
    {
        $previous = libxml_use_internal_errors(true);
        try {
            $feed = simplexml_load_string($xml);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $feed === false ? null : $feed;
    }

    /**
     * Return the shared Guzzle client, creating it on first use.
     *
     * @return GuzzleClient
     */
    private function client(): GuzzleClient
    {
        if ($this->httpClient === null) {
            $this->httpClient = new GuzzleClient();
        }

        return $this->httpClient;
    }
}