2024-01-19 21:00:03 +02:00
|
|
|
package parser
|
2024-01-20 20:33:49 +02:00
|
|
|
|
|
|
|
import (
|
2024-01-21 12:10:06 +02:00
|
|
|
"context"
|
2024-01-20 20:33:49 +02:00
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"log/slog"
|
2024-01-21 12:10:06 +02:00
|
|
|
"net/http"
|
2024-01-20 20:33:49 +02:00
|
|
|
"net/url"
|
|
|
|
"recipes/internal/domain/models"
|
2024-01-21 12:10:06 +02:00
|
|
|
"recipes/internal/lib/stringcv"
|
2024-01-20 20:33:49 +02:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
|
|
|
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
2024-01-21 12:10:06 +02:00
|
|
|
"github.com/google/uuid"
|
2024-01-20 20:33:49 +02:00
|
|
|
"github.com/s32x/httpclient"
|
|
|
|
)
|
|
|
|
|
|
|
|
const baseUrl string = "https://www.vsegdavkusno.ru"
|
|
|
|
|
|
|
|
var client *httpclient.Client = httpclient.New().WithBaseURL(baseUrl)
|
|
|
|
|
|
|
|
var PHPSESSID string
|
|
|
|
var parseKey string
|
|
|
|
|
2024-01-20 20:49:14 +02:00
|
|
|
// Sentinel errors of the parser package. They are returned wrapped with the
// name of the failing operation, so match them with errors.Is.
var (
	// ErrKeyNotFound is returned by GetKey when no key is found in the page.
	ErrKeyNotFound = errors.New("key not found")

	// ErrCookieNotFound is returned by GetPHPSESSID when the response carries
	// no PHPSESSID cookie.
	ErrCookieNotFound = errors.New("cookie not found")

	// ErrNotSuccessReq is returned by SavePage when the listing response is
	// not marked successful.
	ErrNotSuccessReq = errors.New("not success request")

	// ErrEmptyLink is returned by GetRecipe when the recipe has no link.
	ErrEmptyLink = errors.New("empty link")

	// ErrRecipeExists is returned by GetRecipe when the recipe is already
	// stored.
	ErrRecipeExists = errors.New("recipe already exists")

	// ErrFailUpdatePHPSESSID is returned by SavePage when refreshing the
	// session cookie fails.
	ErrFailUpdatePHPSESSID = errors.New("failed to update PHPSESSID")

	// ErrFailUpdateKEY is returned by SavePage when refreshing the key fails.
	ErrFailUpdateKEY = errors.New("failed to update KEY")
)
|
|
|
|
|
2024-01-21 12:10:06 +02:00
|
|
|
// pictureSaver is the storage dependency used to persist recipe images.
type pictureSaver interface {
	// SaveRecipeImage stores the image read from imageFile under filename,
	// together with its content type and its size in bytes.
	SaveRecipeImage(ctx context.Context, imageFile io.Reader, filename, contentType string, fileSize int64) error
}
|
|
|
|
|
|
|
|
type recipeSaver interface {
|
|
|
|
AddRecipe(ctx context.Context, recipe models.Recipe) error
|
|
|
|
}
|
|
|
|
|
|
|
|
// recipeProvider is the storage dependency used to look up stored recipes.
type recipeProvider interface {
	// RecipeExists reports whether a recipe with the given title is already
	// stored.
	RecipeExists(ctx context.Context, title string) (bool, error)
}
|
|
|
|
|
2024-01-20 20:33:49 +02:00
|
|
|
// SaveAllPages saves all pages to storage.
|
2024-01-21 12:52:21 +02:00
|
|
|
func SaveAllPages(log *slog.Logger, ps pictureSaver, rs recipeSaver, rp recipeProvider) error {
|
2024-01-20 20:33:49 +02:00
|
|
|
const op = "parser.SaveAllPages"
|
|
|
|
// get total
|
|
|
|
log.Debug("Сохраняю страницу 1...")
|
2024-01-21 12:10:06 +02:00
|
|
|
total, err := SavePage(log, 1, ps, rs, rp)
|
2024-01-20 20:33:49 +02:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
fmt.Println("Total =", total)
|
|
|
|
for i := 2; i <= total; i++ {
|
|
|
|
log.Debug(fmt.Sprintf("Сохраняю страницу %d...\n", i))
|
2024-01-21 12:10:06 +02:00
|
|
|
_, err = SavePage(log, i, ps, rs, rp)
|
2024-01-21 14:48:47 +02:00
|
|
|
if err != nil {
|
|
|
|
log.Error("Страница не сохранена", "err", fmt.Errorf("%s: %w", op, err))
|
|
|
|
continue
|
|
|
|
}
|
2024-01-20 20:33:49 +02:00
|
|
|
log.Debug(fmt.Sprintf("Страница %d сохранена\n", i))
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// SavePage saves page to storage.
|
2024-01-21 12:52:21 +02:00
|
|
|
func SavePage(log *slog.Logger, page int, ps pictureSaver, rs recipeSaver, rp recipeProvider) (int, error) {
|
2024-01-20 20:33:49 +02:00
|
|
|
const op = "parser.SavePage"
|
|
|
|
|
|
|
|
var resp GetPageResp
|
|
|
|
var body io.Reader
|
|
|
|
for i := 0; i <= 3; i++ {
|
|
|
|
// make form
|
|
|
|
form := make(url.Values)
|
|
|
|
form.Add("page", fmt.Sprint(page))
|
|
|
|
form.Add("action", "filter")
|
|
|
|
form.Add("pageId", "7")
|
|
|
|
form.Add("key", parseKey)
|
|
|
|
// send request
|
|
|
|
err := client.
|
|
|
|
WithHeader("Cookie", fmt.Sprintf("PHPSESSID=%s", PHPSESSID)).
|
|
|
|
WithHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8").
|
|
|
|
Post("/assets/components/msearch2/action.php").WithForm(form).JSON(&resp)
|
|
|
|
if err != nil {
|
|
|
|
return 0, fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
2024-01-21 14:48:47 +02:00
|
|
|
// update PHPSESSID and KEY
|
2024-01-20 20:33:49 +02:00
|
|
|
if resp.Message == "Could not load config" {
|
2024-01-21 14:48:47 +02:00
|
|
|
err = GetPHPSESSID(log)
|
|
|
|
if err != nil {
|
2024-01-22 14:55:26 +02:00
|
|
|
return 0, fmt.Errorf("%s: %w", op, ErrFailUpdatePHPSESSID)
|
2024-01-21 14:48:47 +02:00
|
|
|
}
|
|
|
|
err = GetKey(log)
|
|
|
|
if err != nil {
|
2024-01-22 14:55:26 +02:00
|
|
|
return 0, fmt.Errorf("%s: %w", op, ErrFailUpdateKEY)
|
2024-01-21 14:48:47 +02:00
|
|
|
}
|
2024-01-20 20:33:49 +02:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
if !resp.Success {
|
2024-01-22 14:55:26 +02:00
|
|
|
return 0, fmt.Errorf("%s: %w", op, ErrNotSuccessReq)
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
|
|
|
body = strings.NewReader(resp.Data.Results)
|
|
|
|
break
|
|
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(body)
|
|
|
|
if err != nil {
|
|
|
|
return 0, fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
var recipes []models.Recipe
|
|
|
|
doc.Find("div.recipe-card").Each(func(i int, s *goquery.Selection) {
|
|
|
|
var recipe models.Recipe
|
|
|
|
recipe.Title = s.Find("div.recipe-card__title").Text()
|
|
|
|
// recipe.Time = strings.ReplaceAll(strings.ReplaceAll(s.Find("div.recipe-card__time").Text(), "\n", ""), " ", "")
|
|
|
|
recipe.Image, _ = s.Find("img").Attr("src")
|
|
|
|
recipe.Image = strings.Replace(recipe.Image, "image_366", "image_732", 1)
|
|
|
|
recipe.Image = fmt.Sprintf("%s%s", baseUrl, recipe.Image)
|
|
|
|
recipe.Link, _ = s.Find("a.recipe-card__link").Attr("href")
|
|
|
|
recipe.Link = fmt.Sprintf("/%s", recipe.Link)
|
|
|
|
|
|
|
|
recipes = append(recipes, recipe)
|
|
|
|
})
|
|
|
|
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
wg.Add(len(recipes))
|
|
|
|
for i := 0; i < len(recipes); i++ {
|
2024-01-21 12:52:21 +02:00
|
|
|
go func(i int, log *slog.Logger) {
|
2024-01-20 20:33:49 +02:00
|
|
|
defer wg.Done()
|
2024-01-21 12:10:06 +02:00
|
|
|
err = GetRecipe(&recipes[i], ps, rs, rp)
|
2024-01-20 20:33:49 +02:00
|
|
|
if err != nil {
|
2024-01-22 14:55:26 +02:00
|
|
|
if errors.Is(err, ErrRecipeExists) {
|
2024-01-21 12:52:21 +02:00
|
|
|
log.Warn("Recipe already exists")
|
|
|
|
return
|
|
|
|
}
|
2024-01-21 12:10:06 +02:00
|
|
|
log.Error("Failed to get recipe", "err", fmt.Errorf("%s: %w", op, err))
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
2024-01-21 12:10:06 +02:00
|
|
|
}(i, log)
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
return resp.Data.Pages, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetRecipe gets recipe info and saves recipe to storage.
|
2024-01-21 12:10:06 +02:00
|
|
|
func GetRecipe(r *models.Recipe, ps pictureSaver, rs recipeSaver, rp recipeProvider) error {
|
2024-01-20 20:33:49 +02:00
|
|
|
const op = "parser.GetRecipe"
|
|
|
|
|
|
|
|
if r.Link == "" {
|
2024-01-22 14:55:26 +02:00
|
|
|
return fmt.Errorf("%s: %w", op, ErrEmptyLink)
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
|
|
|
// send request
|
|
|
|
body, err := client.
|
|
|
|
WithHeader("Cookie", fmt.Sprintf("PHPSESSID=%s", PHPSESSID)).
|
|
|
|
Post(r.Link).String()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
bodyr := strings.NewReader(body)
|
|
|
|
doc, err := goquery.NewDocumentFromReader(bodyr)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
// описание
|
|
|
|
r.Description = strings.ReplaceAll(strings.ReplaceAll(strings.ReplaceAll(doc.Find("div.card-binfo__description").Text(), " ", ""), "\n", ""), "\t", "")
|
|
|
|
// Время готовки, кол-во порций
|
|
|
|
doc.Find("div.recipe-summary-items>div.recipe-summary-item").Each(func(i int, s *goquery.Selection) {
|
|
|
|
label := s.Find("div.recipe-summary-item__label").Text()
|
|
|
|
value := strings.ReplaceAll(s.Find("div.recipe-summary-item__value").Text(), "\n", "")
|
|
|
|
switch label {
|
|
|
|
case "Время приготовления:":
|
|
|
|
r.CookingTime = value
|
|
|
|
case "Количество порций:":
|
|
|
|
count, _ := strconv.Atoi(value)
|
|
|
|
r.ServingsNum = uint(count)
|
|
|
|
case "Калорийность:":
|
|
|
|
r.Calories = value
|
|
|
|
}
|
|
|
|
})
|
|
|
|
doc.Find("div.ingredients").Each(func(i int, s *goquery.Selection) {
|
|
|
|
// ингридиенты для рецепта
|
|
|
|
var ingredients models.RecipeIngredients
|
|
|
|
ingredients.Title = s.Find("div.ingredients__title").Text()
|
|
|
|
s.Find("ul.ingredients__list>li").Each(func(i int, q *goquery.Selection) {
|
|
|
|
ingredient := strings.ReplaceAll(strings.ReplaceAll(q.Text(), " ", ""), "\n", "")
|
|
|
|
ingredients.Ingredients = append(ingredients.Ingredients, ingredient)
|
|
|
|
})
|
|
|
|
if len(ingredients.Ingredients) != 0 {
|
|
|
|
r.Ingredients = append(r.Ingredients, ingredients)
|
|
|
|
}
|
|
|
|
// шаги рецепта
|
|
|
|
var recipe_steps []string
|
|
|
|
s.Find("div.recipe-rich>ol>li>span").Each(func(i int, q *goquery.Selection) {
|
|
|
|
recipe_step := strings.ReplaceAll(strings.ReplaceAll(q.Text(), " ", ""), "\n", "")
|
|
|
|
recipe_steps = append(recipe_steps, recipe_step)
|
|
|
|
})
|
|
|
|
if len(recipe_steps) != 0 {
|
|
|
|
r.Recipe_steps = recipe_steps
|
|
|
|
}
|
|
|
|
// рекомендации
|
|
|
|
var advices []string
|
|
|
|
s.Find("div.recipe-footer__additional-text>p").Each(func(i int, q *goquery.Selection) {
|
|
|
|
if q.Find("br").Length() > 0 {
|
2024-01-20 20:49:14 +02:00
|
|
|
html, _ := q.Html()
|
|
|
|
// if err != nil {
|
|
|
|
// return fmt.Errorf("%s: %w", op, err)
|
|
|
|
// }
|
2024-01-20 20:33:49 +02:00
|
|
|
advice_list := q.SetHtml(strings.Replace(html, "<br/>", "\n", -1)).Text()
|
|
|
|
advices_arr := strings.Split(advice_list, "\n")
|
|
|
|
var advices_arr_res []string = make([]string, 0, len(advices_arr))
|
|
|
|
for _, a := range advices_arr {
|
|
|
|
if a != "" {
|
|
|
|
advices_arr_res = append(advices_arr_res, a)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
advices = append(advices, advices_arr_res...)
|
|
|
|
} else {
|
|
|
|
var advice string = q.Text()
|
|
|
|
advices = append(advices, advice)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
if len(advices) != 0 {
|
|
|
|
// fmt.Printf("LEN ADVICES = %d", len(advices))
|
|
|
|
r.Advices = advices
|
|
|
|
}
|
|
|
|
})
|
|
|
|
// категории
|
|
|
|
doc.Find("div.similar-items>a.similar-items__link").Each(func(i int, s *goquery.Selection) {
|
|
|
|
r.Categories = append(r.Categories, s.Text())
|
|
|
|
})
|
|
|
|
// check recipe exists
|
2024-01-21 12:10:06 +02:00
|
|
|
ex, err := rp.RecipeExists(context.Background(), r.Title) // interface!
|
2024-01-20 20:33:49 +02:00
|
|
|
if err != nil || ex {
|
2024-01-22 14:55:26 +02:00
|
|
|
return fmt.Errorf("%s: %w", op, ErrRecipeExists)
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
|
|
|
// save picture
|
2024-01-21 12:10:06 +02:00
|
|
|
err = SaveRecipePicture(r, ps)
|
|
|
|
if err != nil {
|
2024-01-21 12:52:21 +02:00
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
|
|
|
// insert recipe
|
2024-01-21 12:10:06 +02:00
|
|
|
err = rs.AddRecipe(context.Background(), *r)
|
2024-01-21 12:52:21 +02:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
return nil
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
|
|
|
|
2024-01-21 12:10:06 +02:00
|
|
|
func SaveRecipePicture(r *models.Recipe, ps pictureSaver) error {
|
|
|
|
const op = "parser.SaveRecipePicture"
|
2024-01-20 20:33:49 +02:00
|
|
|
|
2024-01-21 12:10:06 +02:00
|
|
|
resp, err := http.Get(r.Image)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
defer resp.Body.Close()
|
|
|
|
content_len, _ := strconv.ParseInt(resp.Header["Content-Length"][0], 10, 64)
|
|
|
|
// change name to generated uuid
|
|
|
|
filename := stringcv.RenameFile(stringcv.GetFilenameFromUrl(r.Image), uuid.NewString())
|
|
|
|
// upload to storage
|
|
|
|
err = ps.SaveRecipeImage(context.Background(), resp.Body, filename, resp.Header["Content-Type"][0], content_len)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// change to filename
|
|
|
|
r.Image = filename
|
|
|
|
return err
|
|
|
|
}
|
2024-01-20 20:33:49 +02:00
|
|
|
|
|
|
|
// GetKey gets
|
2024-01-21 12:52:21 +02:00
|
|
|
func GetKey(log *slog.Logger) error {
|
2024-01-20 20:33:49 +02:00
|
|
|
const op = "parser.GetKey"
|
|
|
|
|
|
|
|
log.Debug("Updating KEY...")
|
|
|
|
defer log.Debug("KEY updated")
|
|
|
|
str, err := client.WithHeader("Cookie", fmt.Sprintf("PHPSESSID=%s", PHPSESSID)).Post("/recipes").String()
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
i := strings.Index(str, "\"key\":")
|
|
|
|
if i != 0 {
|
|
|
|
parseKey = str[i+7 : i+47]
|
2024-01-21 12:52:21 +02:00
|
|
|
log.Debug("New KEY", "key", parseKey)
|
2024-01-20 20:33:49 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2024-01-22 14:55:26 +02:00
|
|
|
return fmt.Errorf("%s: %w", op, ErrKeyNotFound)
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|
|
|
|
|
2024-01-21 12:52:21 +02:00
|
|
|
func GetPHPSESSID(log *slog.Logger) error {
|
2024-01-20 20:33:49 +02:00
|
|
|
const op = "parser.GetPHPSESSID"
|
|
|
|
|
|
|
|
log.Debug("Updating PHPSESSID...")
|
|
|
|
defer log.Debug("PHPSESSID updated")
|
|
|
|
form := make(url.Values)
|
|
|
|
form.Add("page", "1")
|
|
|
|
form.Add("action", "filter")
|
|
|
|
form.Add("pageId", "7")
|
|
|
|
form.Add("key", "-")
|
|
|
|
resp, err := client.
|
|
|
|
WithHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8").
|
|
|
|
WithHeader("Cookie", "").
|
|
|
|
Post("/assets/components/msearch2/action.php").WithForm(form).Do()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("%s: %w", op, err)
|
|
|
|
}
|
|
|
|
for _, c := range resp.Response().Cookies() {
|
|
|
|
if c.Name == "PHPSESSID" {
|
|
|
|
PHPSESSID = c.Value
|
2024-01-21 12:52:21 +02:00
|
|
|
log.Debug("New PHPSESSID", "PHPSESSID", PHPSESSID)
|
2024-01-20 20:33:49 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-01-22 14:55:26 +02:00
|
|
|
return fmt.Errorf("%s: %w", op, ErrCookieNotFound)
|
2024-01-20 20:33:49 +02:00
|
|
|
}
|