Coverage for src/crawler/by_source/asuo_crawler.py: 81%

55 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle

class AsuoCrawler(BaseCollectionCrawler):
    source_name = "Analele Stiintifice ale Universitatii Ovidius Constanta website"
    source_domain = "ASUO"
    source_website = "https://www.anstuocmath.ro/"

    issue_re = r"Volume (?P<volume>\w+) \((?P<year>\d{4})\) fascicola (?P<number>[\d\w]+)"
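    # For illustration, a hypothetical heading of the kind issue_re is meant to match
    # (the sample text is made up, not taken from the journal site):
    #
    #   regex.search(issue_re, "Volume XXIX (2021) fascicola 1").groupdict()
    #   -> {"volume": "XXIX", "year": "2021", "number": "1"}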

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        issues_tags = soup.select("ul.last-volumes li ul li a")
        xissues = []
        for tag in issues_tags:
            issue_search = regex.search(self.issue_re, tag.text)
            if not issue_search:  # coverage: condition was never true
                raise ValueError("Couldn't parse issue")
            issue_data = issue_search.groupdict()
            issue_url = tag.get("href")
            if not isinstance(issue_url, str):  # coverage: condition was never true
                raise ValueError("Couldn't parse issue url")
            issue_url = urljoin(self.collection_url, issue_url)
            xissue = self.create_xissue(
                issue_url,
                issue_data["year"],
                issue_data["volume"],
                issue_data["number"],
            )

            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_tags = soup.select("#right li")
        if not xissue.url:  # coverage: condition was never true
            raise ValueError("Issue must have a URL")
        for index, tag in enumerate(article_tags):
            xarticle = create_articledata()
            xarticle.pid = f"{xissue.pid}_a{index}"
            # Title
            title_tag = tag.select_one("strong")
            if title_tag:
                xarticle.title_tex = title_tag.text

                # If the title is not found, the author field is used to show a title
                # (handled by the elif branch below).
                # Authors
                authors_str = next((el for el in tag.contents if isinstance(el, str)), None)
                if authors_str:  # coverage: condition was always true
                    for a_str in authors_str.replace(" and ", ", ").split(", "):
                        xarticle.contributors.append(
                            create_contributor(string_name=a_str, role="author")
                        )

            elif isinstance(tag.contents[0], str):  # coverage: condition was always true
                xarticle.title_tex = tag.contents[0]
            else:
                print(f"Couldn't find article title in {xissue.url}")
                continue
                raise ValueError("Couldn't find article title")  # unreachable: the continue above runs first

            # PDF
            link_tag = tag.select_one("a[href$='.pdf']")
            if not link_tag:  # coverage: condition was never true
                print(f"Couldn't find pdf link for {xissue.url}")
                continue
            pdf_url = link_tag.get("href")
            if not isinstance(pdf_url, str):  # coverage: condition was never true
                raise ValueError("Couldn't parse article url")
            add_pdf_link_to_xarticle(xarticle, urljoin(xissue.url, pdf_url))

            xissue.articles.append(xarticle)
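
The selectors above suggest that, on an issue page, each article is a <li> under #right holding a <strong> title, a plain-text author list, and a link ending in .pdf. The sketch below illustrates that assumption with made-up markup and variable names; it is not taken from the ASUO site or from the crawler's tests.

from bs4 import BeautifulSoup

# Hypothetical article entry, shaped to match the selectors used in parse_issue_content.
sample = """
<div id="right"><ul>
  <li><strong>A sample article title</strong>
      A. Author, B. Author and C. Author
      <a href="papers/sample.pdf">PDF</a></li>
</ul></div>
"""

tag = BeautifulSoup(sample, "html.parser").select("#right li")[0]
title = tag.select_one("strong").text                             # "A sample article title"
authors = next(el for el in tag.contents if isinstance(el, str))  # author text, with surrounding whitespace
pdf_href = tag.select_one("a[href$='.pdf']").get("href")          # "papers/sample.pdf"
print(title, [a.strip() for a in authors.replace(" and ", ", ").split(", ")], pdf_href)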