Coverage for src/crawler/by_source/asuo_crawler.py: 82%

51 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


class AsuoCrawler(BaseCollectionCrawler):
    source_name = "Analele Stiintifice ale Universitatii Ovidius Constanta website"
    source_domain = "ASUO"
    source_website = "https://www.anstuocmath.ro/"

    issue_re = r"Volume (?P<volume>\w+) \((?P<year>\d{4})\) fascicola (?P<number>[\d\w]+)"
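    # Hypothetical link text this pattern matches (invented example, not from the site):
    #   "Volume XXXII (2024) fascicola 1" -> volume="XXXII", year="2024", number="1"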


    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        issues_tags = soup.select("ul.last-volumes li ul li a")
        xissues = []
        for tag in issues_tags:

            issue_data = regex_to_dict(self.issue_re, tag.text, error_msg="Couldn't parse issue")
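            # Judging from the lookups below, regex_to_dict is assumed to return the
            # pattern's named groups as a dict: {"volume": ..., "year": ..., "number": ...}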

            issue_url = tag.get("href")
            if not isinstance(issue_url, str):  # coverage: branch never taken
                raise ValueError("Couldn't parse issue url")
            issue_url = urljoin(self.collection_url, issue_url)
            xissue = self.create_xissue(
                issue_url,
                issue_data["year"],
                issue_data["volume"],
                issue_data["number"],
            )

            xissues.append(xissue)

        return xissues


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_tags = soup.select("#right li")
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("Issue must have a URL")

        for index, tag in enumerate(article_tags):
            xarticle = create_articledata()
            xarticle.pid = f"{xissue.pid}_a{index}"

            # Title
            title_tag = tag.select_one("strong")
            if title_tag:
                xarticle.title_tex = title_tag.text

            # If no title was found, the author field holds the title instead.
            # Authors
            authors_str = next((el for el in tag.contents if isinstance(el, str)), None)
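            # Hypothetical example: a text node "Jane Doe, John Roe and Ana Pop" is
            # normalized to comma-separated form and split into three author names.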

            if authors_str:  # coverage: condition always true in recorded runs
                for a_str in authors_str.replace(" and ", ", ").split(", "):
                    xarticle.contributors.append(
                        create_contributor(string_name=a_str, role="author")
                    )
            elif isinstance(tag.contents[0], str):  # coverage: condition always true in recorded runs
                xarticle.title_tex = tag.contents[0]
            else:
                print(f"Couldn't find article title in {xissue.url}")
                continue


            # PDF
            link_tag = tag.select_one("a[href$='.pdf']")
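            # The attribute selector picks anchors whose href ends in ".pdf",
            # e.g. a hypothetical <a href="papers/art01.pdf">Full text</a>.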

            if not link_tag:  # coverage: branch never taken
                print(f"Couldn't find pdf link for {xissue.url}")
                continue
            pdf_url = link_tag.get("href")
            if not isinstance(pdf_url, str):  # coverage: branch never taken
                raise ValueError("Couldn't parse article url")
            add_pdf_link_to_xarticle(xarticle, urljoin(xissue.url, pdf_url))

            xissue.articles.append(xarticle)
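
For reference, a minimal standalone sketch of what parse_collection_content expects from the issue-list markup; the HTML snippet and href are invented for illustration, and only the CSS selector and issue_re above are taken from the crawler:

import re

from bs4 import BeautifulSoup

ISSUE_RE = r"Volume (?P<volume>\w+) \((?P<year>\d{4})\) fascicola (?P<number>[\d\w]+)"

html = """
<ul class="last-volumes">
  <li><ul>
    <li><a href="/volume-xxxii-1.html">Volume XXXII (2024) fascicola 1</a></li>
  </ul></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
for tag in soup.select("ul.last-volumes li ul li a"):
    match = re.match(ISSUE_RE, tag.text)
    if match:
        # Same named groups the crawler feeds to create_xissue
        print(match.groupdict(), tag.get("href"))
        # -> {'volume': 'XXXII', 'year': '2024', 'number': '1'} /volume-xxxii-1.html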