Coverage for src/crawler/by_source/aulfm_crawler.py: 84%
71 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1from urllib.parse import urljoin
3import regex
4from bs4 import BeautifulSoup, Tag
5from ptf.model_data import ArticleData, create_abstract, create_articledata, create_subj
7from crawler.base_crawler import BaseCollectionCrawler
8from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
class AulfmCrawler(BaseCollectionCrawler):
    """Crawler for a journal collection hosted on the University of Lodz
    DSpace ("xmlui") repository.

    Implements the three-stage parse contract of ``BaseCollectionCrawler``:
    collection page -> issues, issue page -> article stubs, article page ->
    full article metadata.
    """

    source_name = "University of Lodz Repository"
    source_domain = "AULFM"
    source_website = "https://dspace.uni.lodz.pl/xmlui/"

    # Issue titles look like "<journal name> vol. <volume>/<year>".
    issue_re = r".+ vol. (?P<volume>\d+)\/(?P<year>\d+)"
    # Page ranges use an en dash, e.g. "Pages: 1–10".
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

    def parse_collection_content(self, content):
        """Parse the collection landing page HTML and return a list of issues.

        Each issue link carries a COinS ``span.Z3988`` whose text encodes the
        volume and year (see ``issue_re``). Issues advertising zero articles
        are skipped.

        Raises:
            ValueError: if an issue title does not match ``issue_re`` or the
                issue link has no string ``href``.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("h4.artifact-title a span.Z3988")
        for issue in issues:
            issue_search = regex.search(self.issue_re, issue.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()

            # <h4 class="artifact-title"><a href=...><span class="Z3988">...</span></a> [N]</h4>
            parent = issue.parent.parent
            a_tag = issue.parent
            issue_href = a_tag.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            # Drop the <a> so the remaining <h4> text is only the bracketed
            # article count, e.g. "[12]".
            a_tag.decompose()
            article_count = cleanup_str(parent.text).removeprefix("[").removesuffix("]")
            if article_count == "0":
                continue

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one article stub per entry to
        ``xissue.articles``.

        Stubs get a positional pid ("a0", "a1", ...) and an absolute URL;
        full metadata is fetched later by ``parse_article_content``.

        Raises:
            ValueError: if an article link has no string ``href``.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("h4.artifact-title a")
        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article data")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(self.collection_url, article_url)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page: citation meta tags, the PDF link, and
        DCTERMS abstract/keywords. Returns the enriched ``xarticle``.

        Raises:
            ValueError: if there is not exactly one PDF link candidate, or
                its ``href`` is not a string.
        """
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "publisher", "lang", "author"]
        )

        pdf_link_tag = soup.select(
            ".item-page-field-wrapper > div > a[href^='/xmlui/bitstream/handle']"
        )
        if len(pdf_link_tag) != 1:
            # The guard also trips on zero matches, so report the actual
            # count instead of always claiming "multiple" candidates.
            raise ValueError(
                "Error while trying to parse pdf url : expected exactly one <a> candidate, "
                f"found {len(pdf_link_tag)}"
            )
        pdf_link = pdf_link_tag[0].get("href")
        if not isinstance(pdf_link, str):
            raise ValueError("Couldn't parse article pdf")
        add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_link))

        self.get_metadata_using_dcterms(soup, xarticle, ("abstract", "keywords"))

        return xarticle

    def get_metadata_using_dcterms(self, soup: Tag, xarticle: ArticleData, what):
        """Extract metadata from Dublin Core <meta> tags into ``xarticle``.

        Args:
            soup: parsed article page.
            xarticle: article record to mutate in place.
            what: iterable of field names to extract; supports "abstract"
                (from ``DCTERMS.abstract``, assumed English) and "keywords"
                (from ``DC.subject``).
        """
        if "abstract" in what:
            abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
            if abstract_tag:
                abstract_text = abstract_tag.get("content")
                if isinstance(abstract_text, str):
                    # NOTE(review): lang is hard-coded to "en"; the repository
                    # does not appear to expose the abstract language here.
                    xabstract = create_abstract(
                        lang="en", tag="abstract", value_tex=cleanup_str(abstract_text)
                    )
                    xarticle.abstracts.append(xabstract)

        if "keywords" in what:
            keyword_tags = soup.select("meta[name='DC.subject']")
            for tag in keyword_tags:
                kwd_text = tag.get("content")
                if not isinstance(kwd_text, str) or len(kwd_text) == 0:
                    continue
                kwd = create_subj(value=kwd_text)
                xarticle.kwds.append(kwd)