import { ConfigurationError } from "@pipedream/platform";
import fs from "fs";
import { checkTmp } from "../../common/utils.mjs";
import scrapfly from "../../scrapfly.app.mjs";
/**
 * Pipedream action that sends the contents of a local file to Scrapfly's
 * content-extraction endpoint (via `scrapfly.automateContentExtraction`) and
 * returns the API response.
 *
 * At least one of `extractionTemplate`, `extractionPrompt`, or
 * `extractionModel` must be supplied; otherwise the run is rejected with a
 * `ConfigurationError` before any file or network I/O happens.
 */
export default {
  key: "scrapfly-ai-data-extraction",
  name: "AI Data Extraction",
  description: "Automate content extraction from any text-based source using AI, LLM, and custom parsing. [See the documentation](https://scrapfly.io/docs/extraction-api/getting-started)",
  version: "0.0.1",
  type: "action",
  props: {
    scrapfly,
    // `body`, `contentType` and `url` reuse the definitions declared on the app.
    body: {
      propDefinition: [
        scrapfly,
        "body",
      ],
    },
    contentType: {
      propDefinition: [
        scrapfly,
        "contentType",
      ],
    },
    url: {
      propDefinition: [
        scrapfly,
        "url",
      ],
    },
    charset: {
      type: "string",
      label: "Charset",
      description: "Charset of the document passed in the body. If you are not sure, you can use the `auto` value and we will try to detect it. Bad charset can lead to bad extraction, so it's important to set it correctly. **The most common charset is `utf-8` for text document and `ascii` for binary**. The symptom of a bad charset is that the text is not correctly displayed (accent, special characters, etc).",
      default: "auto",
      optional: true,
    },
    extractionTemplate: {
      type: "string",
      label: "Extraction Template",
      description: "Define an extraction template to get structured data. Use an ephemeral template (declared on the fly on the API call) or a stored template (declared in the dashboard) by using the template name.",
      optional: true,
    },
    extractionPrompt: {
      type: "string",
      label: "Extraction Prompt",
      description: "Instruction to extract data or ask a question on the scraped content with an LLM (Large Language Model). [Must be url encoded](https://scrapfly.io/web-scraping-tools/urlencode).",
      optional: true,
    },
    extractionModel: {
      type: "string",
      label: "Extraction Model",
      description: "AI Extraction to auto parse document to get structured data. E.g., `product`, `review`, `real-estate`, `article`.",
      optional: true,
    },
    webhookName: {
      type: "string",
      label: "Webhook Name",
      description: "Queue your scrape request and redirect API response to a provided webhook endpoint. You can create a webhook endpoint from your `dashboard`, it takes the name of the webhook. Webhooks are scoped to the given project/env.",
      optional: true,
    },
  },
  /**
   * Validates the extraction options, reads the document referenced by
   * `this.body`, and submits it to the extraction endpoint.
   *
   * @param {object} ctx - Run context supplied by the platform.
   * @param {object} ctx.$ - Step helper; forwarded to the app method and used
   *   to export the `$summary`.
   * @returns {Promise<*>} Whatever `scrapfly.automateContentExtraction` resolves with.
   * @throws {ConfigurationError} When none of Extraction Template, Extraction
   *   Prompt, or Extraction Model is set.
   */
  async run({ $ }) {
    const hasExtractionOption = Boolean(
      this.extractionTemplate || this.extractionPrompt || this.extractionModel,
    );
    if (!hasExtractionOption) {
      throw new ConfigurationError("You must provide at least **Extraction Template**, **Extraction Prompt** or **Extraction Model**");
    }

    // `checkTmp` is a project helper; presumably it resolves `body` to a path
    // under /tmp — confirm against common/utils.mjs.
    const filePath = checkTmp(this.body);

    // Read asynchronously so the event loop is not blocked while the file is
    // loaded. The content is decoded as UTF-8, matching the previous
    // `Buffer#toString()` default.
    // NOTE(review): decoding is always UTF-8 regardless of the `charset` prop;
    // confirm whether non-UTF-8 documents should be sent as raw bytes instead.
    const data = await fs.promises.readFile(filePath, "utf-8");

    const response = await this.scrapfly.automateContentExtraction({
      $,
      headers: {
        "content-type": this.contentType,
      },
      // Lift the HTTP client's request-body size cap so large documents can be sent.
      maxBodyLength: Infinity,
      params: {
        url: this.url,
        charset: this.charset,
        extraction_template: this.extractionTemplate,
        extraction_prompt: this.extractionPrompt,
        extraction_model: this.extractionModel,
        webhook_name: this.webhookName,
      },
      data,
    });

    $.export("$summary", "Successfully extracted content");
    return response;
  },
};