Coverage for src/crawler/by_source/ams/ams_jams_crawler.py: 28%

19 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-07-07 11:48 +0000

1from urllib.parse import urljoin 

2 

3from bs4 import BeautifulSoup 

4from ptf.model_data import create_articledata 

5 

6from crawler.by_source.ams.ams_base_crawler import AmsCrawler 

7 

8 

class Ams_jamsCrawler(AmsCrawler):
    """Crawler for the ``AMS_JAMS`` source domain, built on ``AmsCrawler``.

    NOTE(review): "JAMS" presumably stands for the Journal of the American
    Mathematical Society -- confirm against the source registry.
    """

    source_domain = "AMS_JAMS"

    def parse_issue_content(self, content, xissue):
        """Append one article entry to ``xissue.articles`` per link in an issue page.

        ``content`` is parsed as HTML and every anchor matching
        ``article.contentList > dl > dt > a[href]`` yields a new object from
        ``create_articledata()`` whose ``url`` is the anchor's ``href``
        resolved against ``xissue.url`` and whose ``pid`` is ``"a"``
        followed by the anchor's zero-based position in the match list.

        Args:
            content: HTML markup of the issue page.
            xissue: Issue object; its ``url`` is read and its ``articles``
                list is extended in place.

        Raises:
            ValueError: If ``xissue.url`` is empty/unset, or if an anchor's
                ``href`` value is not a string. Entries appended before the
                failing anchor remain in ``xissue.articles``.
        """
        if not xissue.url:
            raise ValueError("xissue url is not set")

        document = BeautifulSoup(content, "html.parser")
        anchors = document.select("article.contentList > dl > dt > a[href]")

        for position, anchor in enumerate(anchors):
            href = anchor.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article url")

            entry = create_articledata()
            # urljoin turns relative links into absolute ones based on the issue URL.
            entry.url = urljoin(xissue.url, href)
            entry.pid = f"a{position}"
            xissue.articles.append(entry)