mirror of
https://github.com/bellingcat/snscrape.git
synced 2026-06-12 12:28:28 +03:00
Merge pull request #518 from hgrsd/fix/vkontakte-photo-scrape
fix(vkontakte): update photo detection
This commit is contained in:
@@ -117,6 +117,9 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
return urllib.parse.unquote(a['href'][13 : end])
|
return urllib.parse.unquote(a['href'][13 : end])
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def is_photo(self, a):
|
||||||
|
return 'aria-label' in a.attrs and a.attrs['aria-label'].startswith('photo')
|
||||||
|
|
||||||
def _date_span_to_date(self, dateSpan):
|
def _date_span_to_date(self, dateSpan):
|
||||||
if not dateSpan:
|
if not dateSpan:
|
||||||
return None
|
return None
|
||||||
@@ -172,7 +175,7 @@ class VKontakteUserScraper(snscrape.base.Scraper):
|
|||||||
not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
|
not (not isCopy and thumbsDiv.parent.name == 'div' and 'class' in thumbsDiv.parent.attrs and 'copy_quote' in thumbsDiv.parent.attrs['class']): # Skip post quotes
|
||||||
photos = []
|
photos = []
|
||||||
for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
|
for a in thumbsDiv.find_all('a', class_ = 'page_post_thumb_wrap'):
|
||||||
if 'data-photo-id' not in a.attrs and 'data-video' not in a.attrs:
|
if not self.is_photo(a) and 'data-video' not in a.attrs:
|
||||||
_logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
|
_logger.warning(f'Skipping non-photo and non-video thumb wrap on {url}')
|
||||||
continue
|
continue
|
||||||
if 'data-video' in a.attrs:
|
if 'data-video' in a.attrs:
|
||||||
|
|||||||
Reference in New Issue
Block a user