Files
cisticola/docs/build/html/cisticola.scraper.html
2022-02-21 17:52:38 +01:00

303 lines
24 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
<title>cisticola.scraper package &#8212; Cisticola documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css" />
<script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
<script src="_static/jquery.js"></script>
<script src="_static/underscore.js"></script>
<script src="_static/doctools.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="cisticola.transformer package" href="cisticola.transformer.html" />
<link rel="prev" title="cisticola package" href="cisticola.html" />
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
<meta name="viewport" content="width=device-width, initial-scale=0.9" />
</head><body>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<section id="cisticola-scraper-package">
<h1>cisticola.scraper package<a class="headerlink" href="#cisticola-scraper-package" title="Permalink to this headline"></a></h1>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Permalink to this headline"></a></h2>
</section>
<section id="module-cisticola.scraper.bitchute">
<span id="cisticola-scraper-bitchute-module"></span><h2>cisticola.scraper.bitchute module<a class="headerlink" href="#module-cisticola.scraper.bitchute" title="Permalink to this headline"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.BitchuteScraper">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">cisticola.scraper.bitchute.</span></span><span class="sig-name descname"><span class="pre">BitchuteScraper</span></span><a class="headerlink" href="#cisticola.scraper.bitchute.BitchuteScraper" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#cisticola.scraper.Scraper" title="cisticola.scraper.Scraper"><code class="xref py py-class docutils literal notranslate"><span class="pre">cisticola.scraper.Scraper</span></code></a></p>
<p>An implementation of a Scraper for Bitchute, using classes from the 4cat
library</p>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.BitchuteScraper.can_handle">
<span class="sig-name descname"><span class="pre">can_handle</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.bitchute.BitchuteScraper.can_handle" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.BitchuteScraper.get_posts">
<span class="sig-name descname"><span class="pre">get_posts</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="cisticola.html#cisticola.base.Channel" title="cisticola.base.Channel"><span class="pre">cisticola.base.Channel</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">since</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#cisticola.scraper.bitchute.BitchuteScraper.get_posts" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.BitchuteScraper.get_username_from_url">
<span class="sig-name descname"><span class="pre">get_username_from_url</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.bitchute.BitchuteScraper.get_username_from_url" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.append_details">
<span class="sig-prename descclassname"><span class="pre">cisticola.scraper.bitchute.</span></span><span class="sig-name descname"><span class="pre">append_details</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">video</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">detail</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.bitchute.append_details" title="Permalink to this definition"></a></dt>
<dd><p>Append extra metadata to video data</p>
<p>Fetches the BitChute video detail page to scrape extra data for the given video.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>video</strong> (<em>dict</em>) &#8211; Video details as scraped so far</p></li>
<li><p><strong>detail</strong> (<em>str</em>) &#8211; Detail level. If &#8216;comments&#8217;, also scrape video comments.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Tuple, first item: updated video data, second: list of comments</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.get_about">
<span class="sig-prename descclassname"><span class="pre">cisticola.scraper.bitchute.</span></span><span class="sig-name descname"><span class="pre">get_about</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">user</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.bitchute.get_about" title="Permalink to this definition"></a></dt>
<dd><p>Extract fields from channel&#8217;s “About” tab</p>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.get_videos_user">
<span class="sig-prename descclassname"><span class="pre">cisticola.scraper.bitchute.</span></span><span class="sig-name descname"><span class="pre">get_videos_user</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">session</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">user</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">csrftoken</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">detail</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.bitchute.get_videos_user" title="Permalink to this definition"></a></dt>
<dd><p>Scrape videos for given BitChute user</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>session</strong> &#8211; HTTP Session to use</p></li>
<li><p><strong>user</strong> (<em>str</em>) &#8211; Username to scrape videos for</p></li>
<li><p><strong>csrftoken</strong> (<em>str</em>) &#8211; CSRF token to use for requests</p></li>
<li><p><strong>detail</strong> (<em>str</em>) &#8211; Detail level to scrape, basic/detail/comments</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Video data dictionaries, as a generator</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.request_from_bitchute">
<span class="sig-prename descclassname"><span class="pre">cisticola.scraper.bitchute.</span></span><span class="sig-name descname"><span class="pre">request_from_bitchute</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">session</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">headers</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.bitchute.request_from_bitchute" title="Permalink to this definition"></a></dt>
<dd><p>Request something via the BitChute API (or non-API)</p>
<p>To avoid having to write the same error-checking everywhere, this takes
care of retrying on failure, et cetera</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>session</strong> &#8211; Requests session</p></li>
<li><p><strong>method</strong> (<em>str</em>) &#8211; GET or POST</p></li>
<li><p><strong>url</strong> (<em>str</em>) &#8211; URL to fetch</p></li>
<li><p><strong>headers</strong> (<em>dict</em>) &#8211; Headers to pass with the request</p></li>
<li><p><strong>data</strong> (<em>dict</em>) &#8211; Data/params to send with the request</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Requests response</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="cisticola.scraper.bitchute.strip_tags">
<span class="sig-prename descclassname"><span class="pre">cisticola.scraper.bitchute.</span></span><span class="sig-name descname"><span class="pre">strip_tags</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">html</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">convert_newlines</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.bitchute.strip_tags" title="Permalink to this definition"></a></dt>
<dd><p>Strip HTML from a string</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>html</strong> &#8211; HTML to strip</p></li>
<li><p><strong>convert_newlines</strong> &#8211; Convert &lt;br&gt; and &lt;/p&gt; tags to newlines before stripping</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Stripped HTML</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-cisticola.scraper.gettr">
<span id="cisticola-scraper-gettr-module"></span><h2>cisticola.scraper.gettr module<a class="headerlink" href="#module-cisticola.scraper.gettr" title="Permalink to this headline"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="cisticola.scraper.gettr.GettrScraper">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">cisticola.scraper.gettr.</span></span><span class="sig-name descname"><span class="pre">GettrScraper</span></span><a class="headerlink" href="#cisticola.scraper.gettr.GettrScraper" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#cisticola.scraper.Scraper" title="cisticola.scraper.Scraper"><code class="xref py py-class docutils literal notranslate"><span class="pre">cisticola.scraper.Scraper</span></code></a></p>
<p>An implementation of a Scraper for Gettr, using gogettr library</p>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.gettr.GettrScraper.can_handle">
<span class="sig-name descname"><span class="pre">can_handle</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.gettr.GettrScraper.can_handle" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.gettr.GettrScraper.get_posts">
<span class="sig-name descname"><span class="pre">get_posts</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="cisticola.html#cisticola.base.Channel" title="cisticola.base.Channel"><span class="pre">cisticola.base.Channel</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">since</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#cisticola.scraper.gettr.GettrScraper.get_posts" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.gettr.GettrScraper.get_username_from_url">
<span class="sig-name descname"><span class="pre">get_username_from_url</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.gettr.GettrScraper.get_username_from_url" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
</section>
<section id="module-cisticola.scraper.twitter">
<span id="cisticola-scraper-twitter-module"></span><h2>cisticola.scraper.twitter module<a class="headerlink" href="#module-cisticola.scraper.twitter" title="Permalink to this headline"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="cisticola.scraper.twitter.TwitterScraper">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">cisticola.scraper.twitter.</span></span><span class="sig-name descname"><span class="pre">TwitterScraper</span></span><a class="headerlink" href="#cisticola.scraper.twitter.TwitterScraper" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#cisticola.scraper.Scraper" title="cisticola.scraper.Scraper"><code class="xref py py-class docutils literal notranslate"><span class="pre">cisticola.scraper.Scraper</span></code></a></p>
<p>An implementation of a Scraper for Twitter, using snscrape library</p>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.twitter.TwitterScraper.can_handle">
<span class="sig-name descname"><span class="pre">can_handle</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.twitter.TwitterScraper.can_handle" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.twitter.TwitterScraper.get_posts">
<span class="sig-name descname"><span class="pre">get_posts</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="cisticola.html#cisticola.base.Channel" title="cisticola.base.Channel"><span class="pre">cisticola.base.Channel</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">since</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#cisticola.scraper.twitter.TwitterScraper.get_posts" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.twitter.TwitterScraper.get_username_from_url">
<span class="sig-name descname"><span class="pre">get_username_from_url</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#cisticola.scraper.twitter.TwitterScraper.get_username_from_url" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
</section>
<section id="module-cisticola.scraper">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-cisticola.scraper" title="Permalink to this headline"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="cisticola.scraper.Scraper">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">cisticola.scraper.</span></span><span class="sig-name descname"><span class="pre">Scraper</span></span><a class="headerlink" href="#cisticola.scraper.Scraper" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.Scraper.can_handle">
<span class="sig-name descname"><span class="pre">can_handle</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="cisticola.html#cisticola.base.Channel" title="cisticola.base.Channel"><span class="pre">cisticola.base.Channel</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="headerlink" href="#cisticola.scraper.Scraper.can_handle" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="cisticola.scraper.Scraper.get_posts">
<span class="sig-name descname"><span class="pre">get_posts</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">channel</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="cisticola.html#cisticola.base.Channel" title="cisticola.base.Channel"><span class="pre">cisticola.base.Channel</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">since</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="cisticola.html#cisticola.base.ScraperResult" title="cisticola.base.ScraperResult"><span class="pre">cisticola.base.ScraperResult</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#cisticola.scraper.Scraper.get_posts" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
</section>
</section>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<h1 class="logo"><a href="index.html">Cisticola</a></h1>
<h3>Navigation</h3>
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">cisticola</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="cisticola.html">cisticola package</a></li>
</ul>
</li>
</ul>
<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="index.html">Documentation overview</a><ul>
<li><a href="modules.html">cisticola</a><ul>
<li><a href="cisticola.html">cisticola package</a><ul>
<li>Previous: <a href="cisticola.html" title="previous chapter">cisticola package</a></li>
<li>Next: <a href="cisticola.transformer.html" title="next chapter">cisticola.transformer package</a></li>
</ul></li>
</ul></li>
</ul></li>
</ul>
</div>
<div id="searchbox" style="display: none" role="search">
<h3 id="searchlabel">Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="search.html" method="get">
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
<input type="submit" value="Go" />
</form>
</div>
</div>
<script>$('#searchbox').show(0);</script>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="footer">
&copy;2022, Bellingcat.
|
Powered by <a href="https://www.sphinx-doc.org/">Sphinx 4.4.0</a>
&amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.12</a>
|
<a href="_sources/cisticola.scraper.rst.txt"
rel="nofollow">Page source</a>
</div>
</body>
</html>