Coverage for src/crawler/by_source/asuo_crawler.py: 81%
55 statements
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class AsuoCrawler(BaseCollectionCrawler):
    source_name = "Analele Stiintifice ale Universitatii Ovidius Constanta website"
    source_domain = "ASUO"
    source_website = "https://www.anstuocmath.ro/"

    issue_re = r"Volume (?P<volume>\w+) \((?P<year>\d{4})\) fascicola (?P<number>[\d\w]+)"

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        issues_tags = soup.select("ul.last-volumes li ul li a")
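        # Markup shape the selector above implies (a reconstruction from the
        # selector itself, not markup copied from the live site):
        #   <ul class="last-volumes">
        #     <li><ul><li><a href="...">Volume ... (YYYY) fascicola ...</a></li></ul></li>
        #   </ul>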
        xissues = []
        for tag in issues_tags:
            issue_search = regex.search(self.issue_re, tag.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue")
            issue_data = issue_search.groupdict()
            issue_url = tag.get("href")
            if not isinstance(issue_url, str):
                raise ValueError("Couldn't parse issue url")
            issue_url = urljoin(self.collection_url, issue_url)
            xissue = self.create_xissue(
                issue_url,
                issue_data["year"],
                issue_data["volume"],
                issue_data["number"],
            )

            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_tags = soup.select("#right li")
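        # Per-article markup these selectors imply (reconstructed from the
        # code below, not copied from the live site): each <li> under #right
        # holds an optional <strong> title, a plain text node with the author
        # names, and an <a> whose href ends in ".pdf".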
        if not xissue.url:
            raise ValueError("Issue must have a URL")
        for index, tag in enumerate(article_tags):
            xarticle = create_articledata()
            xarticle.pid = f"{xissue.pid}_a{index}"
            # Title
            title_tag = tag.select_one("strong")
            if title_tag:
                xarticle.title_tex = title_tag.text

            # If no <strong> title is found, the author field holds the title instead.
            # Authors
            authors_str = next((el for el in tag.contents if isinstance(el, str)), None)
            if authors_str:
                for a_str in authors_str.replace(" and ", ", ").split(", "):
                    xarticle.contributors.append(
                        create_contributor(string_name=a_str, role="author")
                    )
            elif isinstance(tag.contents[0], str):
                xarticle.title_tex = tag.contents[0]
            else:
                print(f"Couldn't find article title in {xissue.url}")
                continue

            # PDF
            link_tag = tag.select_one("a[href$='.pdf']")
            if not link_tag:
                print(f"Couldn't find pdf link for {xissue.url}")
                continue
            pdf_url = link_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse article url")
            add_pdf_link_to_xarticle(xarticle, urljoin(xissue.url, pdf_url))

            xissue.articles.append(xarticle)
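

# Minimal self-check sketch for issue_re (the sample string is hypothetical,
# built to mirror the pattern rather than taken from a real page title):
if __name__ == "__main__":
    match = regex.search(AsuoCrawler.issue_re, "Volume XXXI (2023) fascicola 1")
    assert match is not None
    print(match.groupdict())  # {'volume': 'XXXI', 'year': '2023', 'number': '1'}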