Coverage for src/crawler/by_source/aulfm_crawler.py: 85%

67 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

 1  from urllib.parse import urljoin
 2
 3  from bs4 import BeautifulSoup, Tag
 4  from ptf.model_data import ArticleData, create_abstract, create_articledata, create_subj
 5
 6  from crawler.base_crawler import BaseCollectionCrawler
 7  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict
 8
 9
10  class AulfmCrawler(BaseCollectionCrawler):
11      source_name = "University of Lodz Repository"
12      source_domain = "AULFM"
13      source_website = "https://dspace.uni.lodz.pl/xmlui/"
14
15      issue_re = r".+ vol. (?P<volume>\d+)\/(?P<year>\d+)"
16      pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
17
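As a rough illustration of what these two patterns capture, here is a minimal sketch using the standard re module; the sample strings are made up for the example and are not taken from the repository:

import re

issue_re = r".+ vol. (?P<volume>\d+)\/(?P<year>\d+)"
pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

# Hypothetical issue title as it might appear in the DSpace listing.
m = re.match(issue_re, "Folia Mathematica vol. 20/2023")
assert m and m.group("volume") == "20" and m.group("year") == "2023"

# Hypothetical page-range string; note the en dash between the page numbers.
m = re.search(pages_re, "Pages: 5–17")
assert m and m.group("fpage") == "5" and m.group("lpage") == "17"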

18      def parse_collection_content(self, content):
19          xissues = []
20          soup = BeautifulSoup(content, "html.parser")
21          issues = soup.select("h4.artifact-title a span.Z3988")
22          for issue in issues:
23              issue_dict = regex_to_dict(
24                  self.issue_re, issue.text, error_msg="Couldn't parse issue data"
25              )
26
27              parent = issue.parent.parent
28              a_tag = issue.parent
29              issue_href = a_tag.get("href")
30              if not isinstance(issue_href, str):  # partial branch: line 30 didn't jump to line 31 because the condition on line 30 was never true
31                  raise ValueError("Couldn't parse issue url")
32
33              a_tag.decompose()
34              article_count = cleanup_str(parent.text).removeprefix("[").removesuffix("]")
35              if article_count == "0":
36                  continue
37
38              xissues.append(
39                  self.create_xissue(
40                      urljoin(self.collection_url, issue_href),
41                      issue_dict["year"],
42                      issue_dict["volume"],
43                      None,
44                  )
45              )
46          return xissues
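To make the selector and parent/decompose logic above easier to follow, here is a self-contained sketch against a hypothetical fragment of a DSpace browse page (the markup, handle and article count are invented; plain re and str methods stand in for the project's regex_to_dict and cleanup_str helpers):

from urllib.parse import urljoin
import re
from bs4 import BeautifulSoup

html = """
<h4 class="artifact-title">
  <a href="/xmlui/handle/11089/12345">
    <span class="Z3988">Folia Mathematica vol. 20/2023</span>
  </a>
  [12]
</h4>
"""

soup = BeautifulSoup(html, "html.parser")
span = soup.select("h4.artifact-title a span.Z3988")[0]
data = re.match(r".+ vol. (?P<volume>\d+)\/(?P<year>\d+)", span.text).groupdict()
h4, a_tag = span.parent.parent, span.parent
href = a_tag.get("href")
a_tag.decompose()  # only the "[12]" article count is left inside the <h4>
count = h4.text.strip().removeprefix("[").removesuffix("]")
print(data, count, urljoin("https://dspace.uni.lodz.pl/xmlui/", href))
# {'volume': '20', 'year': '2023'} 12 https://dspace.uni.lodz.pl/xmlui/handle/11089/12345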

47
48      def parse_issue_content(self, content, xissue):
49          soup = BeautifulSoup(content, "html.parser")
50          articles = soup.select("h4.artifact-title a")
51          for index, article_tag in enumerate(articles):
52              article_url = article_tag.get("href")
53              if not isinstance(article_url, str):  # partial branch: line 53 didn't jump to line 54 because the condition on line 53 was never true
54                  raise ValueError("Couldn't parse article data")
55              xarticle = create_articledata()
56              xarticle.pid = "a" + str(index)
57              xarticle.url = urljoin(self.collection_url, article_url)
58              xissue.articles.append(xarticle)

59
60      def parse_article_content(self, content, xissue, xarticle, url):
61          soup = BeautifulSoup(content, "html.parser")
62          self.get_metadata_using_citation_meta(
63              xarticle, xissue, soup, ["title", "publisher", "lang", "author"]
64          )
65
66          pdf_link_tag = soup.select(
67              ".item-page-field-wrapper > div > a[href^='/xmlui/bitstream/handle']"
68          )
69          if len(pdf_link_tag) != 1:  # partial branch: line 69 didn't jump to line 70 because the condition on line 69 was never true

70              raise ValueError("Error while parsing pdf url: expected exactly one <a> candidate")

71          pdf_link = pdf_link_tag[0].get("href")
72          if not isinstance(pdf_link, str):  # partial branch: line 72 didn't jump to line 73 because the condition on line 72 was never true
73              raise ValueError("Couldn't parse article pdf")
74          add_pdf_link_to_xarticle(xarticle, urljoin(url, pdf_link))
75
76          self.get_metadata_using_dcterms(soup, xarticle, ("abstract", "keywords"))
77
78          return xarticle
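Because the selected bitstream href is root-relative (it starts with /xmlui/bitstream/handle), urljoin replaces the path of the article URL rather than appending to it; a quick sanity check with made-up values:

from urllib.parse import urljoin

article_url = "https://dspace.uni.lodz.pl/xmlui/handle/11089/12345"  # hypothetical
pdf_href = "/xmlui/bitstream/handle/11089/12345/article.pdf"         # hypothetical
print(urljoin(article_url, pdf_href))
# https://dspace.uni.lodz.pl/xmlui/bitstream/handle/11089/12345/article.pdf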

79
80      def get_metadata_using_dcterms(self, soup: Tag, xarticle: ArticleData, what):
81          if "abstract" in what:  # partial branch: line 81 didn't jump to line 91 because the condition on line 81 was always true
82              abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
83              if abstract_tag:  # partial branch: line 83 didn't jump to line 91 because the condition on line 83 was always true
84                  abstract_text = abstract_tag.get("content")
85                  if isinstance(abstract_text, str):  # partial branch: line 85 didn't jump to line 91 because the condition on line 85 was always true
86                      xabstract = create_abstract(
87                          lang="en", tag="abstract", value_tex=cleanup_str(abstract_text)
88                      )
89                      xarticle.abstracts.append(xabstract)
90
91          if "keywords" in what:  # partial branch: line 91 didn't return from function 'get_metadata_using_dcterms' because the condition on line 91 was always true
92              keyword_tags = soup.select("meta[name='DC.subject']")
93              for tag in keyword_tags:
94                  kwd_text = tag.get("content")
95                  if not isinstance(kwd_text, str) or len(kwd_text) == 0:  # partial branch: line 95 didn't jump to line 96 because the condition on line 95 was never true
96                      continue
97                  kwd = create_subj(value=kwd_text)
98                  xarticle.kwds.append(kwd)
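For context, the DCTERMS/DC meta tags this method reads might look like the following in the item page head (a hypothetical fragment showing only the abstract and subject tags):

from bs4 import BeautifulSoup

html = """
<meta name="DCTERMS.abstract" content="We study a fixed point theorem ..." />
<meta name="DC.subject" content="fixed point" />
<meta name="DC.subject" content="metric space" />
"""

soup = BeautifulSoup(html, "html.parser")
abstract = soup.select_one("meta[name='DCTERMS.abstract']").get("content")
keywords = [t.get("content") for t in soup.select("meta[name='DC.subject']")]
print(abstract)   # We study a fixed point theorem ...
print(keywords)   # ['fixed point', 'metric space']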