Coverage for src / crawler / by_source / j_stage_crawler.py: 26%
67 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1import logging
3from bs4 import BeautifulSoup
4from ptf.model_data import create_abstract, create_articledata
6from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
7from crawler.utils import regex_to_dict
9logger = logging.getLogger(__name__)
class JStageCrawler(BaseCollectionCrawler):
    """Crawler for the journal "Funkcialaj Ekvacioj" as listed on J-STAGE.

    The class provides the three parsing hooks used on top of
    ``BaseCollectionCrawler``:

    * ``parse_collection_content`` turns the collection page into issues,
    * ``parse_issue_content`` collects the article URLs of one issue,
    * ``parse_article_content`` fills one article from its landing page.
    """

    source_name = "Funkcialaj Ekvacioj"
    source_domain = "J_STAGE"
    source_website = "https://www.jstage.jst.go.jp/browse/fesi/list/-char/en"

    # NOTE(review): the three URL templates below point to da.lib.kobe-u.ac.jp
    # and are not referenced by any method of this class. They look like
    # leftovers from another crawler -- confirm before removing.
    issue_url = "https://da.lib.kobe-u.ac.jp/da/kernel/search/{issue_id}/?lang=1&cate_schema=30000&mode=0&cflg=1&codeno=002"
    article_url = "https://da.lib.kobe-u.ac.jp/da/kernel/{article_id}/?lang=1&mode=0&opkey=R178100158613792&idx=1&chk_schema=30000&cate_schema=30000&cflg=1&codeno=002&fc_val=&chk_st=0&check=00"
    article_url_paginate = "https://da.lib.kobe-u.ac.jp/da/kernel/search/simple/?lang=1&mode=0&opkey={opkey}&chk_schema=30000&list_sort=8&disp_cnt=500&con_kywd=&cate_schema=30000&fc_val=&req=dispchg&chk_st=20&check=00000000000000"

    # Only ``volume_re`` and ``issue_re`` are used by the methods below; the
    # other patterns are kept because code outside this class may read them.
    issue_category_click_re = r"categoryClick\('(?P<issue_id>\d+)'[^)]*\)"
    article_detail_click_re = r"detailClick\('kernel', '(?P<article_id>[^']+)"
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Issue (?P<issue>\d+) Pages (?P<page>[^-]+)"
    volume_re_error = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)~(?P<year_next>\d+)\uff09"
    opkey_re = r"opkey=(?P<opkey>[^&]+)"

    def parse_collection_content(self, content):
        """Build the list of issues found on the collection page.

        Each volume block provides a title matched by ``volume_re`` (volume
        number and year) and a list of issue links whose text is matched by
        ``issue_re``.

        Args:
            content: HTML of the collection page.

        Returns:
            The list of objects returned by ``self.create_xissue``, one per
            issue link that could be turned into an issue.

        Raises:
            ValueError: if a volume title, an issue link or an issue label
                cannot be parsed.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        volume_tags = soup.select(
            "div.facetsearch-content-wrap div.facetsearch-subheader div.facetsearch-content-innerwrap"
        )
        for volume_tag in volume_tags:
            # The volume title lives next to the issue list, under the same parent.
            volume_title_tag = volume_tag.parent.select_one("span.facetsearch-subheader-link a")
            if not volume_title_tag:
                continue
            volume_text = volume_title_tag.text
            # ``.text`` is never None, so test for an empty title instead.
            if not volume_text:
                raise ValueError("Couldn't parse volume tag")
            volume_group = regex_to_dict(
                self.volume_re, volume_text, error_msg="Couldn't parse volume number & year"
            )

            for issue_tag in volume_tag.select("ul li a"):
                issue_url = issue_tag.get("href")
                if not issue_url:
                    raise ValueError("Couldn't find issue url")

                issue_text = issue_tag.text
                if not issue_text:
                    raise ValueError("Couldn't parse issue number")
                try:
                    issue_group = regex_to_dict(
                        self.issue_re, issue_text, error_msg="Couldn't parse issue number"
                    )
                except ValueError as err:
                    # Keep the historical message but preserve the cause.
                    raise ValueError("Couldn't parse issue number") from err

                try:
                    xissue = self.create_xissue(
                        issue_url,
                        volume_group.get("year"),
                        volume_number=volume_group.get("volume"),
                        issue_number=str(issue_group.get("issue")),
                    )
                except TypeError as err:
                    # Best effort: skip this issue, but leave a trace in the logs
                    # instead of printing to stdout.
                    logger.warning("can't create issue for url %s: %s", issue_url, err)
                    continue
                xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        """Append one article per search-result entry to ``xissue.articles``.

        Only the article URL is set here.

        Args:
            content: HTML of the issue page.
            xissue: issue object; its ``articles`` list is extended in place
                and its ``url`` is used in error messages.

        Raises:
            ValueError: if an entry has no title link or the link has no
                ``href``.
        """
        soup = BeautifulSoup(content, "html.parser")
        # As of 12/06/2026 the issue pages need no pagination handling.
        for article_tag in soup.select("ul.search-resultslisting li"):
            article_url_tag = article_tag.select_one("div.searchlist-title a")
            if not article_url_tag:
                raise ValueError(f"Couldn't parse article url for issue {xissue.url}")
            article_url = article_url_tag.get("href")
            if not article_url:
                # Fail early, as is done for issue links, rather than storing
                # an article without a URL.
                raise ValueError(f"Couldn't find article url for issue {xissue.url}")

            xarticle = create_articledata()
            xarticle.url = article_url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the article page and return it.

        Metadata is read through ``get_metadata_using_citation_meta``; the
        abstract, when present, is taken from the ``div.global-para-14``
        element near the "Abstract" label.

        Args:
            content: HTML of the article page.
            xissue: issue the article belongs to.
            xarticle: article object to complete.
            url: URL of the article page (not used by this method).

        Returns:
            The same ``xarticle`` object, updated in place.
        """
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "title",
                "author",
                "doi",
                "publisher",
                "lang",
                "keywords",
                "reference",
            ],
        )

        # ``string=`` is the current name of the deprecated ``text=`` filter.
        abstract_labels = soup.find_all(string="Abstract")
        # The second occurrence of the label is the one used; presumably the
        # first one is a navigation entry -- TODO confirm. Require two matches
        # so a page with a single "Abstract" string no longer raises IndexError.
        if len(abstract_labels) > 1:
            label_parent = abstract_labels[1].parent
            container = label_parent.parent if label_parent is not None else None
            abstract_node = (
                container.select_one("div.global-para-14") if container is not None else None
            )
            if abstract_node is not None:
                xarticle.abstracts.append(
                    create_abstract(lang=xarticle.lang, value_tex=abstract_node.text)
                )
        return xarticle