-
Notifications
You must be signed in to change notification settings - Fork 0
/
curlwwwtext.c
122 lines (101 loc) · 3.86 KB
/
curlwwwtext.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include <curl/curl.h>
#define MAX_COMMAND_LENGTH 1024
#define MAX_ENCODING_LENGTH 32
/*
 * libcurl write callback: append a chunk of received data to a stdio stream.
 *
 * contents  Pointer to the received data (not NUL-terminated).
 * size      Size of one data item in bytes.
 * nmemb     Number of items in this chunk.
 * userp     The FILE * registered through CURLOPT_WRITEDATA.
 *
 * Returns the number of BYTES written. libcurl compares this value against
 * size * nmemb and aborts the transfer when they differ, so the data is
 * written as individual bytes to make fwrite's return value a byte count
 * regardless of the value of `size`.
 */
size_t write_callback(void *contents, size_t size, size_t nmemb, void *userp) {
    FILE *stream = (FILE *)userp;
    size_t total_bytes = size * nmemb;

    return fwrite(contents, 1, total_bytes, stream);
}
/*
 * Detect the character encoding of a file by running file(1) through the
 * shell and extracting the text that follows "charset=" with awk.
 *
 * filename       Path of the file to inspect. It is passed to the shell
 *                inside single quotes, so names containing a single quote
 *                are rejected instead of being interpolated.
 * encoding       Output buffer that receives the NUL-terminated charset name.
 * encoding_size  Size of `encoding` in bytes.
 *
 * Returns 0 on success and -1 on failure (a diagnostic is printed to
 * stderr). On failure `encoding` is left unmodified. A result counts as
 * success only if it is non-empty, consists solely of characters expected
 * in a charset name, and fits in `encoding` without truncation.
 */
int detect_encoding(const char *filename, char *encoding, size_t encoding_size) {
    enum { LINE_CAPACITY = 128 };
    static const char command_prefix[] = "file -i -- '";
    static const char command_suffix[] = "' | awk -F'charset=' '{print $2}'";
    static const char charset_chars[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789-_.:+";
    char line[LINE_CAPACITY];
    char *command;
    size_t command_capacity;
    size_t name_length;
    FILE *pipe;

    if (filename == NULL || encoding == NULL || encoding_size == 0) {
        fprintf(stderr, "Invalid arguments for encoding detection.\n");
        return -1;
    }
    // A single quote would terminate the shell quoting around the file name.
    if (strchr(filename, '\'') != NULL) {
        fprintf(stderr, "Unsupported file name for encoding detection.\n");
        return -1;
    }

    // Size the command from the file name so long paths are never truncated.
    command_capacity = sizeof(command_prefix) + strlen(filename) + sizeof(command_suffix);
    command = malloc(command_capacity);
    if (command == NULL) {
        fprintf(stderr, "Out of memory building encoding detection command.\n");
        return -1;
    }
    snprintf(command, command_capacity, "%s%s%s", command_prefix, filename, command_suffix);

    pipe = popen(command, "r");
    free(command);
    if (!pipe) {
        fprintf(stderr, "Error executing encoding detection command.\n");
        return -1;
    }
    if (fgets(line, sizeof(line), pipe) == NULL) {
        fprintf(stderr, "Error reading encoding detection result.\n");
        pclose(pipe);
        return -1;
    }
    pclose(pipe);

    // Remove the line terminator if present
    line[strcspn(line, "\r\n")] = '\0';

    // An empty line (no "charset=" in the output) or an error message from
    // file(1) is not an encoding; reject it rather than report success.
    name_length = strlen(line);
    if (name_length == 0 || strspn(line, charset_chars) != name_length) {
        fprintf(stderr, "Encoding detection returned no usable charset name.\n");
        return -1;
    }
    if (name_length >= encoding_size) {
        fprintf(stderr, "Detected encoding name does not fit in the output buffer.\n");
        return -1;
    }

    memcpy(encoding, line, name_length + 1);
    return 0;
}
/*
 * Program entry point.
 *
 * Downloads the URL given as the first argument into page.html (in the
 * current directory), detects that file's encoding with detect_encoding(),
 * and runs a shell pipeline that converts the page to text on stdout
 * (iconv when the encoding is not UTF-8, then html2markdown, grep and sed).
 *
 * Returns 0 on success or after printing --help. Returns 1 on a usage
 * error, a failed download, a failed write of page.html, a failed or
 * unusable encoding detection, or when the pipeline could not be started.
 */
int main(int argc, char *argv[]) {
    // Shared pieces of the conversion pipeline, spelled once so that the
    // UTF-8 and non-UTF-8 variants cannot drift apart.
    static const char html_to_markdown[] =
        "html2markdown --no-wrap-links --ignore-tables --ignore-images --ignore-links";
    static const char text_filters[] =
        "grep -v '[[:digit:]]' | "
        "sed -E 's/###/Infos/g' | sed -E 's/##/Actus/g' | sed -E 's/#/Titre/g' | "
        "grep -v '* ' | grep -v 'Publicité'";
    // Characters accepted in an encoding name before it reaches the shell.
    static const char charset_chars[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789-_.:+";

    if (argc < 2) {
        // argv[0] may be NULL when argc is 0, so fall back to a fixed name.
        fprintf(stderr, "Usage: %s <URL> [--help]\n", argc > 0 ? argv[0] : "curlwwwtext");
        return 1;
    }
    if (strcmp(argv[1], "--help") == 0) {
        printf("Help:\n");
        printf("This program downloads the content from the given URL\n");
        printf("and converts it to a readable text format.\n");
        return 0;
    }

    const char *url = argv[1];
    printf("URL: %s\n", url);

    char command[MAX_COMMAND_LENGTH];
    char encoding[MAX_ENCODING_LENGTH];

    CURL *curl = curl_easy_init();
    if (!curl) {
        fprintf(stderr, "Failed to initialize CURL\n");
        return 1;
    }
    FILE *html_file = fopen("page.html", "wb");
    if (!html_file) {
        fprintf(stderr, "Failed to open page.html for writing: %s\n", strerror(errno));
        curl_easy_cleanup(curl);
        return 1;
    }

    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, html_file);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    // An empty string asks libcurl to offer every content encoding it supports.
    curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");

    CURLcode res = curl_easy_perform(curl);
    if (res != CURLE_OK) {
        fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
    }
    curl_easy_cleanup(curl);

    // fclose flushes buffered data; a failure means page.html is incomplete.
    int close_status = fclose(html_file);
    int close_errno = errno;

    if (res != CURLE_OK) {
        return 1;
    }
    if (close_status != 0) {
        fprintf(stderr, "Failed to write page.html: %s\n", strerror(close_errno));
        return 1;
    }

    if (detect_encoding("page.html", encoding, sizeof(encoding)) != 0) {
        fprintf(stderr, "Failed to detect encoding\n");
        return 1;
    }
    printf("Detected encoding: %s\n", encoding);

    // The encoding is interpolated into a shell command below, so accept
    // only a non-empty name made of harmless characters.
    size_t encoding_length = strlen(encoding);
    if (encoding_length == 0 || strspn(encoding, charset_chars) != encoding_length) {
        fprintf(stderr, "Failed to detect encoding\n");
        return 1;
    }

    int written;
    if (strcasecmp(encoding, "utf-8") != 0) {
        // Only convert if the encoding is not UTF-8
        written = snprintf(command, sizeof(command),
                           "iconv -f %s -t UTF-8 page.html | %s | %s",
                           encoding, html_to_markdown, text_filters);
    } else {
        // If it's already UTF-8, skip the iconv step
        written = snprintf(command, sizeof(command),
                           "%s page.html | %s",
                           html_to_markdown, text_filters);
    }
    if (written < 0 || (size_t)written >= sizeof(command)) {
        fprintf(stderr, "Conversion command is too long\n");
        return 1;
    }

    // The child writes straight to the same stdout; flush our buffered
    // output first so it appears before the converted text.
    fflush(stdout);

    int result = system(command);
    if (result == -1) {
        fprintf(stderr, "Failed to execute command: %s\n", strerror(errno));
        return 1;
    }
    return 0;
}