Coverage for src/crawler/by_source/asuo_crawler.py: 82%
51 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
1from urllib.parse import urljoin
3from bs4 import BeautifulSoup
4from ptf.model_data import create_articledata, create_contributor
6from crawler.base_crawler import BaseCollectionCrawler
7from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict
class AsuoCrawler(BaseCollectionCrawler):
    """Crawler for "Analele Stiintifice ale Universitatii Ovidius Constanta".

    Parses the journal's website (anstuocmath.ro) to extract issues and,
    within each issue, article metadata (title, authors, PDF link).
    """

    source_name = "Analele Stiintifice ale Universitatii Ovidius Constanta website"
    source_domain = "ASUO"
    source_website = "https://www.anstuocmath.ro/"

    # Matches issue link text such as "Volume XX (2012) fascicola 1".
    issue_re = r"Volume (?P<volume>\w+) \((?P<year>\d{4})\) fascicola (?P<number>[\d\w]+)"

    def parse_collection_content(self, content):
        """Parse the collection page HTML and return the list of issues.

        Each anchor under ``ul.last-volumes`` is matched against ``issue_re``
        to extract volume, year and number.

        Raises:
            ValueError: if an issue link has no string ``href`` or its text
                does not match ``issue_re`` (via ``regex_to_dict``).
        """
        soup = BeautifulSoup(content, "html.parser")
        issues_tags = soup.select("ul.last-volumes li ul li a")
        xissues = []
        for tag in issues_tags:
            issue_data = regex_to_dict(self.issue_re, tag.text, error_msg="Couldn't parse issue")
            issue_url = tag.get("href")
            if not isinstance(issue_url, str):
                raise ValueError("Couldn't parse issue url")
            # hrefs on the collection page are relative to the collection root.
            issue_url = urljoin(self.collection_url, issue_url)
            xissue = self.create_xissue(
                issue_url,
                issue_data["year"],
                issue_data["volume"],
                issue_data["number"],
            )
            xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append parsed articles to ``xissue``.

        For each ``#right li`` entry: the title comes from a ``<strong>`` tag
        when present, authors from the first bare text node, and the PDF link
        from the first anchor ending in ``.pdf``. Entries without a usable
        title or PDF link are skipped with a diagnostic message.

        Raises:
            ValueError: if ``xissue`` has no URL, or if a PDF anchor has no
                string ``href``.
        """
        soup = BeautifulSoup(content, "html.parser")
        article_tags = soup.select("#right li")
        if not xissue.url:
            raise ValueError("Issue must have an URL")
        for index, tag in enumerate(article_tags):
            xarticle = create_articledata()
            xarticle.pid = f"{xissue.pid}_a{index}"

            # Title
            title_tag = tag.select_one("strong")
            if title_tag:
                xarticle.title_tex = title_tag.text

            # If title is not found, then it means the author field is used to show a title.
            # Authors: the first bare string among the <li>'s children.
            authors_str = next((el for el in tag.contents if isinstance(el, str)), None)
            if authors_str:
                for a_str in authors_str.replace(" and ", ", ").split(", "):
                    xarticle.contributors.append(
                        create_contributor(string_name=a_str, role="author")
                    )
            elif isinstance(tag.contents[0], str):
                xarticle.title_tex = tag.contents[0]
            else:
                # Fixed: dropped the unreachable `raise` that followed this
                # `continue`, and the duplicated word in the message.
                print(f"Couldn't find article title in {xissue.url}")
                continue

            # PDF
            link_tag = tag.select_one("a[href$='.pdf']")
            if not link_tag:
                print(f"Couldn't find pdf link for {xissue.url}")
                continue
            pdf_url = link_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse article url")
            add_pdf_link_to_xarticle(xarticle, urljoin(xissue.url, pdf_url))

            xissue.articles.append(xarticle)