Coverage for src/crawler/by_source/journalfi_crawler.py: 81%

93 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import regex
from bs4 import BeautifulSoup
from ptf.model_data import IssueData, create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class JournalfiCrawler(BaseCollectionCrawler):
    source_name = "Journal.fi"
    source_domain = "JOURNALFI"
    source_website = "https://journal.fi/"

    delimiter_inline_formula = "\\("

    issue_re = r"Vol\. (?P<volume>\d+) No\. (?P<number>\d+) \((?P<year>\d+)\)"
    issue_re_2 = r"Volume (?P<volume>[\d\-]+), (?P<year>\d+)"
    issue_re_3 = r"Numbers (?P<volume>\d+\-\d+), (?P<year>\d+)"
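    # Illustrative (hypothetical) issue titles matched by the patterns above:
    #   issue_re   -> "Vol. 64 No. 1 (2019)"
    #   issue_re_2 -> "Volume 27-28, 2002"
    #   issue_re_3 -> "Numbers 3-4, 1977"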

    def parse_collection_content(self, content):
        print("Collection parsing may take some time, please wait")
        xissues = self.parse_journalfi_collection_content(content)
        return list(xissues.values())
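
    # Parses one page of the journal's issue archive: issue metadata is read from the
    # "a.title" links, and the "a.next" pagination link is followed recursively.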

    def parse_journalfi_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.title")
        xissues: dict[str, IssueData] = {}
        for issue in issues:
            issue_title_str = cleanup_str(issue.text)
            issue_search = regex.search(self.issue_re, issue_title_str)

            # try issue_re_2
            if not issue_search:
                issue_search = regex.search(self.issue_re_2, issue_title_str)
                if issue_search:
                    issue_title_tag = issue.parent.select_one("div.series")
                    title_str = cleanup_str(issue_title_tag.text)
                    issue_search = regex.search(self.issue_re, title_str)

            # try issue_re_3
            if not issue_search:
                issue_search = regex.search(self.issue_re_3, issue_title_str)

            if not issue_search:
                print(f"Couldn't parse issue regex for issue: {issue_title_str}")
                continue

            issue_url = issue.get("href")
            issue_dict = issue_search.groupdict()
            if not isinstance(issue_url, str):  # coverage: never true in the recorded run
                raise ValueError("Cannot parse issue url")
            xissue = self.create_xissue(
                issue_url, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
            )
            if not xissue.pid:  # coverage: never true in the recorded run
                raise ValueError("xissue.pid is None")

            # HACK: create article data in advance because we won't be able to access
            # article urls (https://afm.journal.fi/issue/view/10338)
            if issue_search.re.pattern == self.issue_re_3:
                issue_content = self.download_file(issue_url)
                self.parse_issue_content(issue_content, xissue)
                xissue.url = None
            if xissue.pid not in xissues:
                xissues[xissue.pid] = xissue
            else:
                xissues[xissue.pid].articles.extend(xissue.articles)

        next_page = soup.select_one("a.next")
        if not next_page:
            return xissues

        next_url = next_page.get("href")
        if not isinstance(next_url, str):  # coverage: never true in the recorded run
            raise ValueError("Couldn't parse issue pagination")
        next_content = self.download_file(next_url)
        other_xissues = self.parse_journalfi_collection_content(next_content)
        for issue_pid in other_xissues:
            if issue_pid in xissues:
                xissues[issue_pid].articles.extend(other_xissues[issue_pid].articles)
            else:
                xissues[issue_pid] = other_xissues[issue_pid]
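        # Re-number article pids sequentially within each issue once everything is merged.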

        for xissue in xissues.values():
            for index, article in enumerate(xissue.articles):
                article.pid = "a" + str(index)
        return xissues
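
    # Called from the HACK branch above: builds article stubs (pid and url) from the
    # article links found on an issue page.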

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a[id^='article']")
        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):  # coverage: never true in the recorded run
                raise ValueError("Couldn't parse Article url")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = article_url
            xissue.articles.append(xarticle)
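
    # Reads article metadata from the page's citation <meta> tags, then picks up the
    # abstract from the DC.Description meta tag when one is present.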

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "title", "lang", "page", "doi", "keywords", "pdf"]
        )
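        # The abstract language defaults to the article language and is overridden by
        # the meta tag's xml:lang attribute when present.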

        abstract_tag = soup.select_one("meta[name='DC.Description']")
        if abstract_tag:  # coverage: never true in the recorded run
            abstract_lang = xarticle.lang

            abstract_lang_str = abstract_tag.get("xml:lang")
            if isinstance(abstract_lang_str, str):
                abstract_lang = abstract_lang_str
            del abstract_lang_str

            abstract_content = abstract_tag.get("content")
            if isinstance(abstract_content, str):
                xarticle.abstracts.append(
                    create_abstract(lang=abstract_lang, tag="abstract", value_tex=abstract_content)
                )
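        # Special case: if no title could be extracted, hardcode the known title of
        # this one article.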

        if xarticle.title_tex == "":  # coverage: never true in the recorded run
            if pid == "AFM_2003_28_1_a10":
                xarticle.title_tex = "$\\delta$-stable Fuchsian groups"
        return super().parse_article_content(content, xissue, xarticle, url, pid)