Coverage for src/crawler/by_source/asuo_crawler.py: 82%

51 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


class AsuoCrawler(BaseCollectionCrawler):
    source_name = "Analele Stiintifice ale Universitatii Ovidius Constanta website"
    source_domain = "ASUO"
    source_website = "https://www.anstuocmath.ro/"

    issue_re = r"Volume (?P<volume>\w+) \((?P<year>\d{4})\) fascicola (?P<number>[\d\w]+)"
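    # Hypothetical link text this pattern matches (invented example, not from the site):
    #   "Volume XXXII (2024) fascicola 1" -> volume="XXXII", year="2024", number="1"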


    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        issues_tags = soup.select("ul.last-volumes li ul li a")
        xissues = []
        for tag in issues_tags:

            issue_data = regex_to_dict(self.issue_re, tag.text, error_msg="Couldn't parse issue")
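            # Judging from the lookups below, regex_to_dict is assumed to return the
            # pattern's named groups as a dict: {"volume": ..., "year": ..., "number": ...}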

            issue_url = tag.get("href")
            if not isinstance(issue_url, str):  # coverage: branch never taken
                raise ValueError("Couldn't parse issue url")
            issue_url = urljoin(self.collection_url, issue_url)
            xissue = self.create_xissue(
                issue_url,
                issue_data["year"],
                issue_data["volume"],
                issue_data["number"],
            )

            xissues.append(xissue)

        return xissues


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_tags = soup.select("#right li")
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("Issue must have a URL")

        for index, tag in enumerate(article_tags):
            xarticle = create_articledata()
            xarticle.pid = f"{xissue.pid}_a{index}"

            # Title
            title_tag = tag.select_one("strong")
            if title_tag:
                xarticle.title_tex = title_tag.text

            # If no title was found, the author field holds the title instead.
            # Authors
            authors_str = next((el for el in tag.contents if isinstance(el, str)), None)
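            # Hypothetical example: a text node "Jane Doe, John Roe and Ana Pop" is
            # normalized to comma-separated form and split into three author names.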

            if authors_str:  # coverage: condition always true in recorded runs
                for a_str in authors_str.replace(" and ", ", ").split(", "):
                    xarticle.contributors.append(
                        create_contributor(string_name=a_str, role="author")
                    )
            elif isinstance(tag.contents[0], str):  # coverage: condition always true in recorded runs
                xarticle.title_tex = tag.contents[0]
            else:
                print(f"Couldn't find article title in {xissue.url}")
                continue


            # PDF
            link_tag = tag.select_one("a[href$='.pdf']")
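            # The attribute selector picks anchors whose href ends in ".pdf",
            # e.g. a hypothetical <a href="papers/art01.pdf">Full text</a>.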

            if not link_tag:  # coverage: branch never taken
                print(f"Couldn't find pdf link for {xissue.url}")
                continue
            pdf_url = link_tag.get("href")
            if not isinstance(pdf_url, str):  # coverage: branch never taken
                raise ValueError("Couldn't parse article url")
            add_pdf_link_to_xarticle(xarticle, urljoin(xissue.url, pdf_url))

            xissue.articles.append(xarticle)
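
For reference, a minimal standalone sketch of what parse_collection_content expects from the issue-list markup; the HTML snippet and href are invented for illustration, and only the CSS selector and issue_re above are taken from the crawler:

import re

from bs4 import BeautifulSoup

ISSUE_RE = r"Volume (?P<volume>\w+) \((?P<year>\d{4})\) fascicola (?P<number>[\d\w]+)"

html = """
<ul class="last-volumes">
  <li><ul>
    <li><a href="/volume-xxxii-1.html">Volume XXXII (2024) fascicola 1</a></li>
  </ul></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
for tag in soup.select("ul.last-volumes li ul li a"):
    match = re.match(ISSUE_RE, tag.text)
    if match:
        # Same named groups the crawler feeds to create_xissue
        print(match.groupdict(), tag.get("href"))
        # -> {'volume': 'XXXII', 'year': '2024', 'number': '1'} /volume-xxxii-1.html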