Coverage for src/crawler/by_source/journalfi_crawler.py: 81% (93 statements)
import regex
from bs4 import BeautifulSoup
from ptf.model_data import IssueData, create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str

class JournalfiCrawler(BaseCollectionCrawler):
    source_name = "Journal.fi"
    source_domain = "JOURNALFI"
    source_website = "https://journal.fi/"

    delimiter_inline_formula = "\\("
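
    # Issue link text comes in a few formats; one pattern per format, e.g.
    # "Vol. 46 No. 2 (2021)", "Volume 1-2, 1999", "Numbers 3-4, 1977"
    # (example values are illustrative).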
    issue_re = r"Vol\. (?P<volume>\d+) No\. (?P<number>\d+) \((?P<year>\d+)\)"
    issue_re_2 = r"Volume (?P<volume>[\d\-]+), (?P<year>\d+)"
    issue_re_3 = r"Numbers (?P<volume>\d+\-\d+), (?P<year>\d+)"

    def parse_collection_content(self, content):
        print("Collection parsing may take some time, please wait")
        xissues = self.parse_journalfi_collection_content(content)
        return list(xissues.values())
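
    # Parses one page of the collection's issue list, then follows the "next"
    # pagination link recursively and merges the results by issue pid.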
    def parse_journalfi_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.title")
        xissues: dict[str, IssueData] = {}
        for issue in issues:
            issue_title_str = cleanup_str(issue.text)
            issue_search = regex.search(self.issue_re, issue_title_str)

            # try issue_re_2
            if not issue_search:
                issue_search = regex.search(self.issue_re_2, issue_title_str)
                if issue_search:
                    issue_title_tag = issue.parent.select_one("div.series")
                    title_str = cleanup_str(issue_title_tag.text)
                    issue_search = regex.search(self.issue_re, title_str)

            # try issue_re_3
            if not issue_search:
                issue_search = regex.search(self.issue_re_3, issue_title_str)

            if not issue_search:
                print(f"Couldn't parse issue regex for issue: {issue_title_str}")
                continue

            issue_url = issue.get("href")
            issue_dict = issue_search.groupdict()
            if not isinstance(issue_url, str):
                raise ValueError("Cannot parse issue url")
            xissue = self.create_xissue(
                issue_url, issue_dict["year"], issue_dict["volume"], issue_dict.get("number", None)
            )
            if not xissue.pid:
                raise ValueError("xissue.pid is None")

            # HACK: create article data in advance because article URLs can't be
            # accessed for these issues (e.g. https://afm.journal.fi/issue/view/10338)
            if issue_search.re.pattern == self.issue_re_3:
                issue_content = self.download_file(issue_url)
                self.parse_issue_content(issue_content, xissue)
                xissue.url = None
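            # Merge articles if an issue with the same pid has already been collected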
            if xissue.pid not in xissues:
                xissues[xissue.pid] = xissue
            else:
                xissues[xissue.pid].articles.extend(xissue.articles)

        next_page = soup.select_one("a.next")
        if not next_page:
            return xissues

        next_url = next_page.get("href")
        if not isinstance(next_url, str):
            raise ValueError("Couldn't parse issue pagination")
        next_content = self.download_file(next_url)
        other_xissues = self.parse_journalfi_collection_content(next_content)
        for issue_pid in other_xissues:
            if issue_pid in xissues:
                xissues[issue_pid].articles.extend(other_xissues[issue_pid].articles)
            else:
                xissues[issue_pid] = other_xissues[issue_pid]
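
        # Re-assign sequential article pids (a0, a1, ...) once all pages have been merged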
        for xissue in xissues.values():
            for index, article in enumerate(xissue.articles):
                article.pid = "a" + str(index)
        return xissues
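
    # Builds article stubs from the links on an issue page; used above for
    # issues whose article URLs cannot be accessed individually.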
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a[id^='article']")
        for index, article_tag in enumerate(articles):
            article_url = article_tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse Article url")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = article_url
            xissue.articles.append(xarticle)
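
    # Parses an article page: bibliographic metadata is read from the citation_* meta tags.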
    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "title", "lang", "page", "doi", "keywords", "pdf"]
        )
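        # The abstract, when present, is taken from the DC.Description meta tag;
        # its language defaults to the article's unless an xml:lang attribute is set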
        abstract_tag = soup.select_one("meta[name='DC.Description']")
        if abstract_tag:
            abstract_lang = xarticle.lang

            abstract_lang_str = abstract_tag.get("xml:lang")
            if isinstance(abstract_lang_str, str):
                abstract_lang = abstract_lang_str
            del abstract_lang_str

            abstract_content = abstract_tag.get("content")
            if isinstance(abstract_content, str):
                xarticle.abstracts.append(
                    create_abstract(lang=abstract_lang, tag="abstract", value_tex=abstract_content)
                )
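        # Manual fallback for one article whose parsed title is empty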
        if xarticle.title_tex == "":
            if pid == "AFM_2003_28_1_a10":
                xarticle.title_tex = "$\\delta$-stable Fuchsian groups"
        return super().parse_article_content(content, xissue, xarticle, url, pid)