Coverage for src/crawler/by_source/aulfm_crawler.py: 86%

64 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

 1  from urllib.parse import urljoin
 2
 3  import regex
 4  from bs4 import BeautifulSoup, Tag
 5  from ptf.model_data import ArticleData, create_abstract, create_articledata, create_subj
 6
 7  from crawler.base_crawler import BaseCollectionCrawler
 8  from crawler.utils import cleanup_str
 9
10
11  class AulfmCrawler(BaseCollectionCrawler):
12      source_name = "University of Lodz Repository"
13      source_domain = "AULFM"
14      source_website = "https://dspace.uni.lodz.pl/xmlui/"
15
16      issue_re = r".+ vol. (?P<volume>\d+)\/(?P<year>\d+)"
17      pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
18
19      def parse_collection_content(self, content):
20          xissues = []
21          soup = BeautifulSoup(content, "html.parser")
22          issues = soup.select("h4.artifact-title a span.Z3988")
23          for issue in issues:
24              issue_search = regex.search(self.issue_re, issue.text)
25              if not issue_search:  # 25 ↛ 26: branch never taken (condition never true)
26                  raise ValueError("Couldn't parse issue data")
27              issue_dict = issue_search.groupdict()
28
29              parent = issue.parent.parent
30              a_tag = issue.parent
31              issue_href = a_tag.get("href")
32              if not isinstance(issue_href, str):  # 32 ↛ 33: branch never taken (condition never true)
33                  raise ValueError("Couldn't parse issue url")
34
35              a_tag.decompose()
36              article_count = cleanup_str(parent.text).removeprefix("[").removesuffix("]")
37              if article_count == "0":
38                  continue
39
40              xissues.append(
41                  self.create_xissue(
42                      urljoin(self.collection_url, issue_href),
43                      issue_dict["year"],
44                      issue_dict["volume"],
45                      None,
46                  )
47              )
48          return xissues
49
50      def parse_issue_content(self, content, xissue):
51          soup = BeautifulSoup(content, "html.parser")
52          articles = soup.select("h4.artifact-title a")
53          for index, article_tag in enumerate(articles):
54              article_url = article_tag.get("href")
55              if not isinstance(article_url, str):  # 55 ↛ 56: branch never taken (condition never true)
56                  raise ValueError("Couldn't parse article data")
57              xarticle = create_articledata()
58              xarticle.pid = "a" + str(index)
59              xarticle.url = urljoin(self.collection_url, article_url)
60              xissue.articles.append(xarticle)
61
62      def parse_article_content(self, content, xissue, xarticle, url, pid):
63          soup = BeautifulSoup(content, "html.parser")
64          self.get_metadata_using_citation_meta(
65              xarticle, xissue, soup, ["title", "publisher", "lang", "author", "pdf"]
66          )
67          self.get_metadata_using_dcterms(soup, xarticle, ("abstract", "keywords"))
68
69          return super().parse_article_content(content, xissue, xarticle, url, pid)
70
71      def get_metadata_using_dcterms(self, soup: Tag, xarticle: ArticleData, what):
72          if "abstract" in what:  # 72 ↛ 82: branch never taken (condition always true)
73              abstract_tag = soup.select_one("meta[name='DCTERMS.abstract']")
74              if abstract_tag:  # 74 ↛ 82: branch never taken (condition always true)
75                  abstract_text = abstract_tag.get("content")
76                  if isinstance(abstract_text, str):  # 76 ↛ 82: branch never taken (condition always true)
77                      xabstract = create_abstract(
78                          lang="en", tag="abstract", value_tex=cleanup_str(abstract_text)
79                      )
80                      xarticle.abstracts.append(xabstract)
81
82          if "keywords" in what:  # 82 ↛ exit: branch to function exit never taken (condition always true)
83              keyword_tags = soup.select("meta[name='DC.subject']")
84              for tag in keyword_tags:
85                  kwd_text = tag.get("content")
86                  if not isinstance(kwd_text, str) or len(kwd_text) == 0:  # 86 ↛ 87: branch never taken (condition never true)
87                      continue
88                  kwd = create_subj(value=kwd_text)
89                  xarticle.kwds.append(kwd)
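
As a side note, a minimal sketch of what the issue_re and pages_re patterns defined on lines 16-17 extract when applied with regex.search, as line 24 does. The sample strings below are hypothetical stand-ins for the DSpace listing text, not values taken from the repository:

    import regex

    issue_re = r".+ vol. (?P<volume>\d+)\/(?P<year>\d+)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

    # Hypothetical inputs shaped like the listing text the crawler parses
    issue_title = "Folia Mathematica vol. 9/1997"
    pages_line = "Pages: 3–17"

    issue_match = regex.search(issue_re, issue_title)
    if issue_match:
        print(issue_match.groupdict())  # {'volume': '9', 'year': '1997'}

    pages_match = regex.search(pages_re, pages_line)
    if pages_match:
        print(pages_match.groupdict())  # {'fpage': '3', 'lpage': '17'}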