Web scraping

De Banane Atomic
Aller à la navigation · Aller à la recherche

Html Agility Pack

Cs.svg
using System;
using HtmlAgilityPack;  // nuget HtmlAgilityPack

// Download a Google results page for `query` and query the HTML with Html Agility Pack.
// `query` is not declared in this snippet and must already be in scope.
HttpClient httpClient = new HttpClient();
// Percent-encode the search terms: a plain space-to-'+' replacement leaves characters
// such as '&', '#', '+' or '%' untouched, which corrupts or changes the request.
var url = $"https://www.google.com/search?q={Uri.EscapeDataString(query)}";
var response = await httpClient.GetAsync(url);
var htmlContent = await response.Content.ReadAsStringAsync();

// parse html
var pageDocument = new HtmlDocument();
pageDocument.LoadHtml(htmlContent);

// XPath queries
// NOTE: both selectors yield null (not an empty result) when nothing matches,
// so check for null before using `node` / `nodes`.
var node = pageDocument.DocumentNode.SelectSingleNode("//div[contains(@id, 'my-id')]");
var nodes = pageDocument.DocumentNode.SelectNodes("//div[contains(@id, 'myId')]//div[contains(@class, 'myClass')][1]//li");

API:Search Wikipedia

API:Opensearch

Cs.svg
// Search Wikipedia for `query`, download the best-matching article and build a short
// summary of its first paragraph in `text`. `text` is left null when nothing is found.
// `query` and `httpClient` are not declared in this snippet and must already be in scope.

// Percent-encode the search terms so characters such as '&' or '#' cannot alter the request.
var url = $"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={Uri.EscapeDataString(query)}&srlimit=1&srnamespace=0&format=json";
var response = await httpClient.GetAsync(url);
var jsonResult = await response.Content.ReadAsAsync<JObject>();
// Without a search hit the token path does not exist; keep `title` null instead of throwing.
var title = jsonResult?.SelectToken("query.search[0].title")?.Value<string>();

// Only fetch the article when the search produced a title; otherwise parse an empty document.
string htmlContent = string.Empty;
if (!string.IsNullOrEmpty(title))
{
    // Wikipedia page names use '_' for spaces; the rest (e.g. '?', '#') is escaped
    // so it stays part of the path instead of starting a query string or fragment.
    url = $"https://en.wikipedia.org/wiki/{Uri.EscapeDataString(title.Replace(" ", "_"))}";
    response = await httpClient.GetAsync(url);
    htmlContent = await response.Content.ReadAsStringAsync();
}

var pageDocument = new HtmlDocument();
pageDocument.LoadHtml(htmlContent);

// Take the first paragraph of the article body whose text does not start with a line break.
// SelectNodes yields null when nothing matches, and no paragraph may qualify: both give null.
var nodes = pageDocument.DocumentNode.SelectNodes("//div[contains(@id, 'mw-content-text')]//p");
string text = nodes?.FirstOrDefault(n => !n.InnerText.StartsWith("\n", StringComparison.Ordinal))?.InnerText;

if (text != null)
{
    const int MaxLength = 500;

    // Shorten the text one sentence at a time until it fits.
    while (text.Length > MaxLength)
    {
        int lastPeriod = text.LastIndexOf(".", StringComparison.Ordinal);
        if (lastPeriod < 0)
        {
            // No sentence boundary left: fall back to a hard cut instead of
            // passing -1 to Remove, which would throw.
            text = text.Substring(0, MaxLength);
            break;
        }

        text = text.Remove(lastPeriod);
    }

    // remove the references [1] (the brackets appear as HTML entities in InnerText)
    text = Regex.Replace(text, @"&#91;\d{1,3}&#93;", "");
    if (!text.Trim().EndsWith(".", StringComparison.Ordinal))
    {
        text += ".";
    }

    // Disambiguation pages start with "... may refer to:".
    if (text.Contains("may refer to:"))
    {
        text = "Your search is a bit ambiguous, could you specify your search terms?";
    }
}