Coverage for src/crawler/by_source/jgaa_crawler.py: 87%

58 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
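"""Crawler for the Journal of Graph Algorithms and Applications (JGAA) website,
https://jgaa.info/."""
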

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict


# https://jgaa.info/index.php/jgaa/oai?verb=ListRecords&metadataPrefix=oai_dc
class JgaaCrawler(BaseCollectionCrawler):
    source_name = "Journal of Graph Algorithms and Applications website"
    source_domain = "JGAA"
    source_website = "https://jgaa.info/"

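    # Matches archive headings like "Vol. 27 No. 6 (2023)"; regex_to_dict exposes
    # the named groups, e.g. {"volume": "27", "number": "6", "year": "2023"}.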

    issue_re = r"Vol\. (?P<volume>\d+) No\. (?P<number>\d+) \((?P<year>\d+)\)"

    def parse_collection_content(self, content):
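        """Parse one page of the issue archive into a list of xissues.

        Recursively follows the "next" pagination link, so a single call
        returns the issues from every remaining archive page.
        """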

        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("div.obj_issue_summary > h2")
        for issue in issues:
            issue_title_str = ""

            series_title = issue.select_one("a.title")
            if not series_title:
                raise ValueError("Couldn't find issue title")
            issue_href = series_title.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            series_tag = issue.select_one("div.series")
            if series_tag:
                issue_title_str = series_title.text
            else:
                series_tag = series_title

            issue_dict = regex_to_dict(
                self.issue_re,
                series_tag.text,
                error_msg="Couldn't parse volume year",
            )

            xissue = self.create_xissue(
                issue_href,
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict["number"],
            )
            xissue.title_tex = issue_title_str
            xissues.append(xissue)

        paginator = soup.select_one("a.next")
        if paginator:
            next_page_url = paginator.get("href")
            if not isinstance(next_page_url, str):
                raise ValueError("Couldn't parse pagination url")
            next_page_content = self.download_file(next_page_url)
            xissues.extend(self.parse_collection_content(next_page_content))

        return xissues


    def parse_issue_content(self, content, xissue):
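        """Collect the article links of an issue page.

        Each anchor matching a[id^='article'] becomes a stub xarticle
        (pid "a0", "a1", ...); full metadata is left for
        parse_article_content to fill in.
        """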

        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a[id^='article']")
        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = article_url
            xissue.articles.append(xarticle)


    def parse_article_content(self, content, xissue, xarticle, url):
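        """Extract one article's metadata from its landing page.

        Most fields come from the citation meta tags; the abstract is
        scraped from the page body, with its label heading stripped.
        """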

        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "lang", "pdf", "page", "doi", "title", "keywords"]
        )

        abstract_tag = soup.select_one("section.item.abstract")
        if abstract_tag:
            label_tag = abstract_tag.select_one("h2.label")
            if label_tag:
                label_tag.decompose()
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_tag.text), lang=xarticle.lang
                )
            )
        return xarticle
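

# Minimal usage sketch (assumptions: BaseCollectionCrawler normally drives these
# hooks itself, the constructor arguments are elided, and the archive URL below
# follows the usual OJS layout; none of this appears in this file):
#
#     crawler = JgaaCrawler(...)
#     html = crawler.download_file("https://jgaa.info/index.php/jgaa/issue/archive")
#     xissues = crawler.parse_collection_content(html)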