Coverage for src/crawler/by_source/j_stage_crawler.py

1import logging

3from bs4 import BeautifulSoup

4from ptf.model_data import create_abstract, create_articledata

6from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler

7from crawler.utils import regex_to_dict

9logger = logging.getLogger(__name__)

class JStageCrawler(BaseCollectionCrawler):
    """Crawler for the "Funkcialaj Ekvacioj" collection as listed on J-STAGE.

    The three ``parse_*`` methods each receive the HTML of one page
    (collection listing, issue listing, article page) and extract the data
    needed for the next step with BeautifulSoup.
    """

    source_name = "Funkcialaj Ekvacioj"
    source_domain = "J_STAGE"
    source_website = "https://www.jstage.jst.go.jp/browse/fesi/list/-char/en"

    # NOTE(review): the URL templates and the *_click_re / volume_re_error /
    # opkey_re patterns below are not referenced by any method of this class.
    # They are kept because they are public class attributes that code outside
    # this class may read - confirm before removing.
    issue_url = "https://da.lib.kobe-u.ac.jp/da/kernel/search/{issue_id}/?lang=1&cate_schema=30000&mode=0&cflg=1&codeno=002"
    article_url = "https://da.lib.kobe-u.ac.jp/da/kernel/{article_id}/?lang=1&mode=0&opkey=R178100158613792&idx=1&chk_schema=30000&cate_schema=30000&cflg=1&codeno=002&fc_val=&chk_st=0&check=00"
    article_url_paginate = "https://da.lib.kobe-u.ac.jp/da/kernel/search/simple/?lang=1&mode=0&opkey={opkey}&chk_schema=30000&list_sort=8&disp_cnt=500&con_kywd=&cate_schema=30000&fc_val=&req=dispchg&chk_st=20&check=00000000000000"

    issue_category_click_re = r"categoryClick\('(?P<issue_id>\d+)'[^)]*\)"
    article_detail_click_re = r"detailClick\('kernel', '(?P<article_id>[^']+)"
    # Matches volume titles such as "Volume 12 (1999)".
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    # Matches issue links such as "Issue 3 Pages 101-...".
    issue_re = r"Issue (?P<issue>\d+) Pages (?P<page>[^-]+)"
    volume_re_error = r"(?P<volume>\d+)巻\uff08(?P<year>\d+)～(?P<year_next>\d+)\uff09"
    opkey_re = r"opkey=(?P<opkey>[^&]+)"

    def parse_collection_content(self, content):
        """Extract every issue of the collection from the listing page.

        For each volume block, the volume number and year are read from the
        volume title, then one issue is created per issue link found in the
        block.

        Args:
            content: HTML of the collection listing page.

        Returns:
            A list of the objects returned by ``self.create_xissue``, one per
            issue link that could be turned into an issue.

        Raises:
            ValueError: if a volume title is empty, an issue link has no
                ``href`` or no text, or the issue text does not match
                ``issue_re``.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        volumes_tag = soup.select(
            "div.facetsearch-content-wrap div.facetsearch-subheader div.facetsearch-content-innerwrap"
        )
        for volume_tag in volumes_tag:
            volume_title_tag = volume_tag.parent.select_one("span.facetsearch-subheader-link a")
            if volume_title_tag is None:
                # Blocks without a volume title link are skipped.
                continue

            volume_text = volume_title_tag.text
            # `.text` yields a string, never None, so test for emptiness.
            if not volume_text:
                raise ValueError("Couldn't parse volume tag")
            volume_group = regex_to_dict(
                self.volume_re, volume_text, error_msg="Couldn't parse volume number & year"
            )

            for issue_tag in volume_tag.select("ul li a"):
                issue_url = issue_tag.get("href")
                if not issue_url:
                    raise ValueError("Couldn't find issue url")

                issue_text = issue_tag.text
                if not issue_text:
                    raise ValueError("Couldn't parse issue number")
                try:
                    issue_group = regex_to_dict(
                        self.issue_re, issue_text, error_msg="Couldn't parse issue number"
                    )
                except ValueError as err:
                    # Keep the exception type and message, but preserve the cause.
                    raise ValueError("Couldn't parse issue number") from err

                try:
                    xissue = self.create_xissue(
                        issue_url,
                        volume_group.get("year"),
                        volume_number=volume_group.get("volume"),
                        issue_number=str(issue_group.get("issue")),
                    )
                except TypeError:
                    # Best effort: a single broken issue must not abort the
                    # whole collection, but the failure has to be traceable.
                    logger.exception("can't create issue for url %s", issue_url)
                    continue
                xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        """Attach one article stub (URL only) to ``xissue`` per listed article.

        Args:
            content: HTML of the issue page.
            xissue: issue object whose ``articles`` list is extended in place.

        Raises:
            ValueError: if a result entry has no title link, or the link has
                no ``href``.
        """
        soup = BeautifulSoup(content, "html.parser")
        # On 12/06/2026 no managing of pagination required
        for article_tag in soup.select("ul.search-resultslisting li"):
            article_url_tag = article_tag.select_one("div.searchlist-title a")
            if article_url_tag is None:
                raise ValueError(f"Couldn't parse article url for issue {xissue.url}")

            article_url = article_url_tag.get("href")
            if not article_url:
                # Same policy as for issue links: a link without target is an error.
                raise ValueError(f"Couldn't parse article url for issue {xissue.url}")

            xarticle = create_articledata()
            xarticle.url = article_url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the article page.

        Metadata is delegated to ``self.get_metadata_using_citation_meta``;
        the abstract, when present, is read from the page body.

        Args:
            content: HTML of the article page.
            xissue: issue the article belongs to.
            xarticle: article object to complete in place.
            url: URL of the article page (not used by this method).

        Returns:
            The same ``xarticle`` object.
        """
        soup = BeautifulSoup(content, "html.parser")

        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "pdf",
                "title",
                "author",
                "doi",
                "publisher",
                "lang",
                "keywords",
                "reference",
            ],
        )

        abstract_labels = soup.find_all(string="Abstract")
        # The abstract block is located from the SECOND "Abstract" string
        # (presumably the first one is a navigation label - TODO confirm),
        # so at least two matches are required before indexing.
        if len(abstract_labels) > 1:
            section = abstract_labels[1].parent.parent
            abstract_tag = (
                section.select_one("div.global-para-14") if section is not None else None
            )
            if abstract_tag is not None:
                xarticle.abstracts.append(
                    create_abstract(lang=xarticle.lang, value_tex=abstract_tag.text)
                )
        return xarticle

Coverage for src / crawler / by_source / j_stage_crawler.py: 26%

67 statements