RandomLineIterator - salathe/spl-examples GitHub Wiki

RandomLineIterator

Assume we have a file of hostnames that we need to iterate over, something like:

proxy1.example.com
proxy2.example.com
proxy3.example.com
proxy4.example.com
proxy5.example.com
// … more entries

Further assume that we want to iterate over this file in random order without loading it into memory, because we are memory-bound on the host machine. To save memory, we can iterate over the file line by line with SplFileObject, which keeps only the currently iterated line in memory. To randomize the lines, we can use an OuterIterator that determines how many lines are in the file and shuffles the order in which they are fetched before iteration starts. Only the shuffled list of line numbers is held in memory, not the lines themselves.

Example

<?php
// Open the host list; lines are read from disk on demand.
$hostFile = new SplFileObject('proxies.txt');
$hostFile->setFlags(SplFileObject::DROP_NEW_LINE | SplFileObject::SKIP_EMPTY);

// Wrap the file so that its lines are handed out in a shuffled order.
foreach (new RandomLineIterator($hostFile) as $hostname) {
    echo $hostname, PHP_EOL;
}

Class Code

<?php
/**
 * Iterates over the lines of an SplFileObject in a random order.
 *
 * On construction the file is sought to its end to find the highest line
 * index; the indexes 0..highest are then shuffled once. During iteration only
 * that shuffled list is walked: each step seeks the wrapped file to the
 * selected line, so line contents are fetched from the file one at a time.
 *
 * The order is fixed at construction; rewind() replays the same order.
 */
class RandomLineIterator implements OuterIterator
{
    /**
     * The file whose lines are handed out.
     *
     * @var SplFileObject
     */
    private $fileObject;

    /**
     * Shuffled line indexes. The array's internal pointer is the iteration cursor.
     *
     * @var array
     */
    private $linesToTry;

    /**
     * @param SplFileObject $fileObject File to read lines from; any flags set on it stay in effect.
     */
    public function __construct(SplFileObject $fileObject)
    {
        $this->fileObject = $fileObject;
        $this->initLinesToTry();
    }

    /**
     * Builds the shuffled list of line indexes.
     *
     * @return void
     */
    private function initLinesToTry()
    {
        // Seek as far as the file allows; the key reported afterwards is
        // taken as the highest line index.
        // NOTE(review): what key() reports after seeking past the end appears
        // to depend on the PHP version and on a trailing newline, so the range
        // may contain one index past the last real line - confirm.
        $this->fileObject->seek(PHP_INT_MAX);
        $this->linesToTry = range(0, $this->fileObject->key());
        shuffle($this->linesToTry);
    }

    /**
     * Returns the line the wrapped file is currently positioned on.
     *
     * @return mixed Whatever the wrapped file's current() returns.
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return $this->getInnerIterator()->current();
    }

    /**
     * Advances to the next shuffled line index and positions the file there.
     *
     * @return void
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        $line = next($this->linesToTry);

        // next() yields false once the list is exhausted. That value must not
        // reach seek(): it is not a line number and would send the file back
        // to line 0 (or raise a TypeError for strict_types callers).
        if ($line !== false) {
            $this->getInnerIterator()->seek($line);
        }
    }

    /**
     * Returns the position within the shuffled order (not the file line number).
     *
     * @return mixed Integer position, or null once the iteration is exhausted.
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return key($this->linesToTry);
    }

    /**
     * Tells whether the cursor still points at a shuffled line index.
     *
     * @return boolean
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        // key() is null exactly when the array pointer is past the end, which
        // does not depend on the values stored in the list.
        return key($this->linesToTry) !== null;
    }

    /**
     * Restarts the iteration at the first shuffled line index.
     *
     * @return void
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $line = reset($this->linesToTry);

        // reset() yields false for an empty list; there is nothing to seek to then.
        if ($line !== false) {
            $this->getInnerIterator()->seek($line);
        }
    }

    /**
     * Returns the wrapped file object.
     *
     * @return Iterator
     */
    #[\ReturnTypeWillChange]
    public function getInnerIterator()
    {
        return $this->fileObject;
    }
}

Generator version

Since PHP 5.5 introduced the yield keyword, the same behaviour can be written more compactly as a generator function:

<?php
/**
 * Lazily yields the lines of a file in a random order.
 *
 * The file is opened with the SKIP_EMPTY and DROP_NEW_LINE flags. The highest
 * line index is found by seeking towards PHP_INT_MAX, the indexes 0..highest
 * are shuffled, and each one is then sought and yielded in turn.
 *
 * @param string $filePath Path passed to the SplFileObject constructor.
 * @return Generator Yields whatever SplFileObject::current() returns per index.
 */
function getLinesInRandomOrder($filePath)
{
    $reader = new SplFileObject($filePath);
    $reader->setFlags(SplFileObject::SKIP_EMPTY | SplFileObject::DROP_NEW_LINE);

    // Seek as far as the file allows; the key reported afterwards is taken
    // as the highest line index.
    $reader->seek(PHP_INT_MAX);
    $lastIndex = $reader->key();

    $order = range(0, $lastIndex);
    shuffle($order);

    // shuffle() reindexes the list from 0, so it can be walked by position.
    for ($i = 0, $total = count($order); $i < $total; $i++) {
        $reader->seek($order[$i]);
        yield $reader->current();
    }
}

// Stream the hostnames in shuffled order, printing one per line.
$shuffledHosts = getLinesInRandomOrder('proxies.txt');
foreach ($shuffledHosts as $hostname) {
    echo $hostname, PHP_EOL;
}
⚠️ **GitHub.com Fallback** ⚠️