Coverage for src/crawler/by_source/jgaa_crawler.py: 86%

62 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


# https://jgaa.info/index.php/jgaa/oai?verb=ListRecords&metadataPrefix=oai_dc
class JgaaCrawler(BaseCollectionCrawler):
    source_name = "Journal of Graph Algorithms and Applications website"
    source_domain = "JGAA"
    source_website = "https://jgaa.info/"

    issue_re = r"Vol\. (?P<volume>\d+) No\. (?P<number>\d+) \((?P<year>\d+)\)"
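    # e.g. "Vol. 27 No. 1 (2023)" -> volume="27", number="1", year="2023"
    # (illustrative heading, not a string taken from this report)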

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("div.obj_issue_summary > h2")
        for issue in issues:
            issue_title_str = ""

            series_title = issue.select_one("a.title")
            if not series_title:  # coverage: 25 ↛ 26 never taken (condition never true)
                raise ValueError("Couldn't find issue title")
            issue_href = series_title.get("href")
            if not isinstance(issue_href, str):  # coverage: 28 ↛ 29 never taken
                raise ValueError("Couldn't parse issue url")

            series_tag = issue.select_one("div.series")
            if series_tag:
                issue_title_str = series_title.text
            else:
                series_tag = series_title

            issue_search = regex.search(self.issue_re, series_tag.text)
            if not issue_search:  # coverage: 38 ↛ 39 never taken
                raise ValueError("Couldn't parse volume year")
            issue_dict = issue_search.groupdict()

            xissue = self.create_xissue(
                issue_href,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict["number"],
            )
            xissue.title_tex = issue_title_str
            xissues.append(xissue)

        paginator = soup.select_one("a.next")
        if paginator:
            next_page_url = paginator.get("href")
            if not isinstance(next_page_url, str):  # coverage: 54 ↛ 55 never taken
                raise ValueError("Couldn't parse pagination url")
            next_page_content = self.download_file(next_page_url)
            # Recurse into the next results page and append its issues.
            xissues.extend(self.parse_collection_content(next_page_content))

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a[id^='article']")
        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):  # coverage: 66 ↛ 67 never taken
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            # pids are assigned positionally: "a0", "a1", ... in page order.
            xarticle.pid = "a" + str(index)
            xarticle.url = article_url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "lang", "pdf", "page", "doi", "title", "keywords"]
        )

        abstract_tag = soup.select_one("section.item.abstract")
        if abstract_tag:  # coverage: 80 ↛ 89, condition always true
            label_tag = abstract_tag.select_one("h2.label")
            if label_tag:  # coverage: 82 ↛ 84, condition always true
                # Drop the "Abstract" heading so only the abstract body remains.
                label_tag.decompose()
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_tag.text), lang=xarticle.lang
                )
            )
        return super().parse_article_content(content, xissue, xarticle, url, pid)
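
As a quick illustration of the issue-page scraping above, here is a minimal,
self-contained sketch of the a[id^='article'] selector and the positional pid
scheme. The HTML snippet and its ids and hrefs are invented for illustration;
they are not taken from jgaa.info.

from bs4 import BeautifulSoup

html = (
    '<a id="article-101" href="https://jgaa.info/article/101">First article</a>'
    '<a id="article-102" href="https://jgaa.info/article/102">Second article</a>'
)
soup = BeautifulSoup(html, "html.parser")
for index, article_tag in enumerate(soup.select("a[id^='article']")):
    # Same positional pid scheme as parse_issue_content: "a0", "a1", ...
    print("a" + str(index), article_tag.get("href"))
# a0 https://jgaa.info/article/101
# a1 https://jgaa.info/article/102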
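
Similarly, the abstract-extraction step in parse_article_content can be
exercised on its own. A minimal sketch, assuming markup shaped like the
section.item.abstract block the crawler targets (the snippet itself is
invented for illustration):

from bs4 import BeautifulSoup

html = (
    '<section class="item abstract">'
    '<h2 class="label">Abstract</h2>'
    "We study planar graph drawing."
    "</section>"
)
soup = BeautifulSoup(html, "html.parser")
abstract_tag = soup.select_one("section.item.abstract")
label_tag = abstract_tag.select_one("h2.label")
if label_tag:
    label_tag.decompose()  # drop the "Abstract" heading, keep only the body text
print(abstract_tag.text.strip())
# We study planar graph drawing.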