前回記事の続きです。生成 AI が、いつの間にか新しい局面を迎えているように感じます。問い合わせあたりの文字数制限（正確に言えばトークンの制限）が大幅に増え、問い合わせの際に、前提知識として学術論文数十本、あるいは新書10冊ぐらいを読み込ませてから回答させることができるようになっています。これまでは、生成 AI が持つ知識から問い合わせる形になっていましたが、これによって、こちらが持つ知識や情報を生成 AI に聞かせてやることができるようになりつつあります。この流れがさらに進めば、今まではできなさそうだった有用性を発揮することができるようになるかもしれない、ということです。とりあえず、今試したいことをちょこちょことやってみています。
で、前回記事をみた人から、J-STAGEからPDFをダウンロードする方法についてのリクエストがありましたので、ちょっと書いてみます。
生成AIに読み込ませる信頼できる情報源としての学術論文ですが、現在はJ-STAGEでかなりのオープンアクセス論文が公開されています。これを雑誌単位でまるっと利用できれば、その学会が扱っている分野に関する比較的専門性の高い生成AIのRAGが構築できそうな感じです。なお、国際ジャーナルであればもっと楽だし、そちらの方がいいのではないか、という話もあるのですが、J-STAGEに載っている学術雑誌には固有の良さがありますので、それはそれで重要なことです。
J-STAGEはAPIでのデータ公開を行っています。雑誌一つをまとめてまるっとダウンロードしたい時は、これを使うのが便利です。
というわけで、J-STAGEのAPIを使って、任意の雑誌の論文PDFをごっそりダウンロードするスクリプトを書いてみました。J-STAGEに迷惑がかからないように、特に、利用規約はよく読んでからご利用ください。規約では大量ダウンロードは禁じられていますので、大量にならないように、サーバに大きな負荷がかかってしまわないように、など、気をつけてください。
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os
import re
import sys
import time

# J-STAGE search API endpoint.
# service=2 lists the volumes of a journal; service=3 lists the articles
# of one volume.
SEARCH_API = 'https://api.jstage.jst.go.jp/searchapi/do'


def save_pdf(link, save_path):
    """Download one article PDF from *link* into directory *save_path*.

    The file name is taken from the Content-Disposition response header;
    if the header is missing or has no filename, the response is ignored.
    An existing file is never overwritten, and the PDF body is fetched
    only once (the original fetched it twice).
    """
    pdf_response = requests.get(link, timeout=60)
    content_disposition = pdf_response.headers.get('content-disposition')
    if not (content_disposition and 'filename=' in content_disposition):
        return
    # Header looks like: attachment; filename="foo.pdf"
    pdf_filename = content_disposition.split('filename=')[1].strip('"')
    pdf_save_path = os.path.join(save_path, pdf_filename)
    if os.path.isfile(pdf_save_path):
        print('skip:', pdf_filename)
        return
    # Reuse the response body already in hand instead of issuing a
    # second GET for the same URL.
    with open(pdf_save_path, 'wb') as pdf_file:
        pdf_file.write(pdf_response.content)
    print(f"Downloaded: {pdf_filename}")


def main():
    """CLI entry point: python <script> <journal_id> <save_directory>."""
    if len(sys.argv) <= 2:
        print("保存フォルダ名を2番目の引数に指定してください。")
        sys.exit()
    journal_id = sys.argv[1]
    save_directory = sys.argv[2]
    os.makedirs(save_directory, exist_ok=True)

    # Enumerate the journal's volumes (service=2).
    response = requests.get(
        SEARCH_API + '?service=2&cdjournal=' + journal_id, timeout=60)
    soup = BeautifulSoup(response.content, 'xml')
    for entry in soup.select('entry'):
        eachvol = entry.find_all('volume')[0].get_text()
        # Enumerate the articles of this volume (service=3).
        # BUG FIX: the journal code was hard-coded to 'jpbs' here, so the
        # script only ever worked for that one journal regardless of the
        # journal ID given on the command line.
        volurl = (SEARCH_API + '?service=3&cdjournal=' + journal_id
                  + '&vol=' + eachvol)
        eresponse = requests.get(volurl, timeout=60)
        eachsoup = BeautifulSoup(eresponse.content, 'xml')
        for eentry in eachsoup.select('entry'):
            elink = eentry.select('link')[0].get('href')
            # The article landing page URL becomes the PDF URL by
            # swapping the path segment.
            pdflink = re.sub(r'_article/', r'_pdf/', elink)
            print(pdflink)
            save_pdf(pdflink, save_directory)
            time.sleep(1)  # be polite to the J-STAGE servers


if __name__ == '__main__':
    main()
このスクリプトは、以下のように利用します。
$ python このスクリプト 雑誌のID 保存先フォルダ名
これで、PDFファイルは一通り入手できるはずです。うまくいきましたら、次は、OCRをかけるために、そのPDFからJPG画像を全部切り出します。こういうのは Pythonで一括処理するのが楽ですね。
from pdf2image import convert_from_path
import os
import glob
import re

# Convert every PDF in the current directory to per-page JPEG images,
# one sub-directory per PDF under ./jpgs/ (e.g. jpgs/<pdf-name>/p_1.jpg),
# so the pages can be fed to OCR afterwards.
for pdf in glob.glob("*.pdf"):
    pdfpath = 'jpgs/' + os.path.splitext(pdf)[0]
    # BUG FIX: os.mkdir() raises FileNotFoundError when the parent
    # directory 'jpgs' does not exist yet; makedirs creates the whole
    # path, and exist_ok makes re-runs idempotent.
    os.makedirs(pdfpath, exist_ok=True)
    images = convert_from_path(pdf)
    for page_no, image in enumerate(images, start=1):
        image_path = f"{pdfpath}/p_{page_no}.jpg"
        image.save(image_path, "JPEG")
        print(f"Page {page_no} saved as {image_path}")
さあ、ここまでできたら、次はOCRですが、というのはまた次回に。